diff --git a/CMakeLists.txt b/CMakeLists.txt index 6290c3b614e77cd17522434cedee26ce6551fe8d..cd49df28f3d2414551672635f15cd8562f89da63 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,8 @@ cmake_minimum_required(VERSION 3.10) +# Include External Project options +include( ExternalProject ) + # set the project name project(sisso++ VERSION 0.1 LANGUAGES CXX) @@ -265,6 +268,43 @@ set(MPI_LIBRARIES, ${MPI_CXX_LIBRARIES}) list(GET MPI_CXX_LIBRARIES 0 MPI_LIBRARY) get_filename_component(MPI_DIR ${MPI_LIBRARY} DIRECTORY) +# Find the Shark libraries and includes +# set Shark_DIR to the proper location of Shark +# find_package(Shark) +# if(NOT SHARK_FOUND) + set(SHARK_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/shark/build/") + set(SHARK_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/shark/bin/") + set(SHARK_INCLUDE_DIRS "${SHARK_INSTALL_DIR}/include") + set(SHARK_LIBRARY_DIRS "${SHARK_INSTALL_DIR}/lib") + set(SHARK_CMAKE_ARGS "-DBUILD_EXAMPLES=OFF;-DBUILD_TESTING=OFF;-DBUILD_SHARED_LIBS=ON;-DENABLE_OPENMP=OFF;-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER};-DCMAKE_INSTALL_PREFIX=${SHARK_INSTALL_DIR};-DCBLAS_LIBRARY_PATH=${LAPACK_LIBRARY_DIR};") + ExternalProject_Add( + external_shark + PREFIX "external/shark" + GIT_REPOSITORY "https://github.com/Shark-ML/Shark.git" + GIT_TAG "4.0" + CMAKE_ARGS "${SHARK_CMAKE_ARGS}" + BINARY_DIR "${SHARK_BUILD_DIR}" + INSTALL_DIR "${SHARK_INSTALL_DIR}" + ) + add_dependencies(external_shark boost::filesystem) + add_dependencies(external_shark boost::system) + add_library( shark SHARED IMPORTED ) + set_property( TARGET shark PROPERTY IMPORTED_LOCATION ${SHARK_LIBRARY_DIRS}/libshark.so ) + set_property( TARGET shark PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${SHARK_INCLUDE_DIRS} ) + set_property( TARGET shark PROPERTY INTERFACE_LINK_LIBRARIES "${Boost_LIBRARIES};${LAPACK_LIBRARIES}" ) + add_dependencies(shark external_shark) + set(SHARK_LIBRARIES "${SHARK_LIBRARY_DIRS}/libshark.so") + # The Shark version number + 
set(SHARK_VERSION_MAJOR "4") + set(SHARK_VERSION_MINOR "0") + set(SHARK_VERSION_PATCH "0") + + include_directories(${SHARK_INCLUDE_DIRS}) +# else(NOT SHARK_FOUND) +# message(STATUS "SHARK_USE_FILE ${SHARK_USE_FILE}") +# include(${SHARK_USE_FILE}) +# endif() + include_directories(${CMAKE_CURRENT_LIST_DIR}/src) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1faefe1329d14c3e29a50f29f3b8a491dd87a9a5..942abe34eaa3387a256873b2875d1d84c3b364cc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,7 +13,7 @@ include_directories(${CMAKE_CURRENT_LIST_DIR}/python/) include_directories(${CMAKE_CURRENT_LIST_DIR}/utils/) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations") -set(CMAKE_INSTALL_RPATH ${Boost_LIBRARY_DIRS};${LAPACK_DIR};${MPI_DIR}) +set(CMAKE_INSTALL_RPATH ${Boost_LIBRARY_DIRS};${LAPACK_DIR};${MPI_DIR};${SHARK_LIBRARY_DIRS}) # set(INSTALL_RPATH ${Boost_LIB_DIR}) file(GLOB_RECURSE SISSOPP_SOURCES *.cpp) @@ -31,12 +31,12 @@ set_target_properties(sisso++ RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" ) -target_link_libraries(sisso++ ${LAPACK_LIBRARIES} ${MPI_LIBRARIES} -Wl,--rpath=${Boost_LIB_DIR} -Wl,--rpath=${LAPACK_DIR} ${Boost_LIBRARIES}) +target_link_libraries(sisso++ ${LAPACK_LIBRARIES} ${MPI_LIBRARIES} -Wl,--rpath=${Boost_LIB_DIR} -Wl,--rpath=${LAPACK_DIR} ${Boost_LIBRARIES} ${SHARK_LIBRARIES}) install(TARGETS sisso++ DESTINATION ${CMAKE_CURRENT_LIST_DIR}/../bin/) if(USE_PYTHON) include(${CMAKE_CURRENT_LIST_DIR}/../cmake/TransferDocStrings.cmake) - set(CMAKE_INSTALL_RPATH "${Boost_LIBRARY_DIRS};${PYTHON_PREFIX}/lib/;${MPI_DIR}") + set(CMAKE_INSTALL_RPATH "${Boost_LIBRARY_DIRS};${PYTHON_PREFIX}/lib/;${MPI_DIR};${SHARK_LIBRARY_DIRS}") transfer_doc_string(${CMAKE_CURRENT_LIST_DIR}/python/bindings_docstring_keyed.cpp ${CMAKE_CURRENT_LIST_DIR}/python/bindings.cpp) transfer_doc_string(${CMAKE_CURRENT_LIST_DIR}/python/bindings_docstring_keyed.hpp 
${CMAKE_CURRENT_LIST_DIR}/python/bindings.hpp) @@ -59,7 +59,7 @@ if(USE_PYTHON) PREFIX "" SUFFIX ".so" ) - target_link_libraries(_sisso ${MPI_LIBRARIES} -Wl,--rpath=${PYTHON_PREFIX}/lib/ ${LAPACK_LIBRARIES} ${PYTHON_LIBRARIES} -Wl,--rpath=${Boost_LIB_DIR} ${Boost_LIBRARIES} ${Boost_PYTHON_LIBRARIES}) + target_link_libraries(_sisso ${MPI_LIBRARIES} -Wl,--rpath=${PYTHON_PREFIX}/lib/ ${LAPACK_LIBRARIES} ${PYTHON_LIBRARIES} -Wl,--rpath=${Boost_LIB_DIR} ${Boost_LIBRARIES} ${Boost_PYTHON_LIBRARIES} ${SHARK_LIBRARIES}) install(TARGETS _sisso DESTINATION "${PYTHON_INSTDIR}/cpp_sisso") install( DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/python/ DESTINATION ${PYTHON_INSTDIR}/cpp_sisso diff --git a/src/descriptor_identifier/Model/Model.cpp b/src/descriptor_identifier/Model/Model.cpp index b44d9965baf26b3c6240dcf967ab768cfab1ba29..50f0316ffb4737ad706d8ce78ad9427857cc99ed 100644 --- a/src/descriptor_identifier/Model/Model.cpp +++ b/src/descriptor_identifier/Model/Model.cpp @@ -7,391 +7,13 @@ Model::Model(Unit prop_unit, std::vector<double> prop_train, std::vector<double> _feats(feats), _prop_train(prop_train), _prop_test(prop_test), - _train_error(_n_samp_train), - _test_error(_n_samp_test), _D_train(_n_samp_train * (feats.size() + (1 - fix_intercept))), _D_test(_n_samp_test * (feats.size() + (1 - fix_intercept))), - _prop_train_est(_n_samp_train, 0.0), - _prop_test_est(_n_samp_test, 0.0), _task_sizes_train(task_sizes_train), _task_sizes_test(task_sizes_test), _prop_unit(prop_unit), _fix_intercept(fix_intercept) -{ - _prop_train_est.reserve(_n_samp_train); - _prop_test_est.reserve(_n_samp_test); +{} - std::vector<double> a(_D_train.size(), 1.0); - std::vector<double> work(_D_train.size(), 0.0); - - int info; - int start = 0; - - for(auto& sz : _task_sizes_train) - { - std::fill_n(a.data(), a.size(), 1.0); - std::fill_n(_D_train.data(), _D_train.size(), 1.0); - - for(int ff = 0; ff < feats.size(); ++ff) - { - std::copy_n(feats[ff]->value_ptr() + start, sz, _D_train.data() + ff * sz); - 
std::copy_n(feats[ff]->value_ptr() + start, sz, a.data() + ff * sz); - } - - dgels_('N', sz, _n_dim + 1 - _fix_intercept, 1, a.data(), sz, prop_train.data() + start, sz, work.data(), work.size(), &info); - - _coefs.push_back(std::vector<double>(_n_dim + (!fix_intercept), 0.0)); - - std::copy_n(prop_train.begin() + start, _n_dim + 1 - _fix_intercept, _coefs.back().data()); - dgemv_('N', sz, _n_dim + 1 - _fix_intercept, 1.0, _D_train.data(), sz, _coefs.back().data(), 1, 0.0, _prop_train_est.data() + start, 1); - std::transform(_prop_train_est.begin() + start, _prop_train_est.begin() + start + sz, _prop_train.data() + start, _train_error.data() + start, std::minus<double>()); - - start += sz; - } - - start = 0; - int ii = 0; - for(auto& sz : _task_sizes_test) - { - if(sz > 0) - { - for(int ff = 0; ff < feats.size(); ++ff) - std::copy_n(feats[ff]->test_value().data() + start, sz, _D_test.data() + ff * sz); - - std::fill_n(_D_test.data() + feats.size() * sz, sz, 1.0); - dgemv_('N', sz, _n_dim + 1 - _fix_intercept, 1.0, _D_test.data(), sz, _coefs[ii].data(), 1, 0.0, _prop_test_est.data() + start, 1); - std::transform(_prop_test_est.begin() + start, _prop_test_est.begin() + start + sz, _prop_test.data() + start, _test_error.data() + start, std::minus<double>()); - } - ++ii; - start += sz; - } -} - -Model::Model(std::string train_file) -{ - _n_samp_test = 0; - - std::vector<std::string> split_str; - std::vector<std::string> feature_expr_train = populate_model(train_file, true); - - for(int ff = 0; ff < feature_expr_train.size(); ++ff) - { - split_str = str_utils::split_string_trim(feature_expr_train[ff]); - int rung = std::stoi(split_str[0]); - std::string unit_str = split_str[1]; - std::string postfix_expr = split_str[2]; - std::string expr = split_str[3]; - - std::vector<double> feat_val(_n_samp_train); - std::vector<double> feat_test_val = {}; - std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data()); - - model_node_ptr feat = 
std::make_shared<ModelNode>(ff, rung, expr, postfix_expr, feat_val, feat_test_val, Unit(unit_str)); - _feats.push_back(feat); - } - -} - -Model::Model(std::string train_file, std::string test_file) -{ - std::vector<std::string> split_str; - std::vector<std::string> feature_expr_train = populate_model(train_file, true); - std::vector<std::string> feature_expr_test = populate_model(test_file, false); - - for(int ff = 0; ff < feature_expr_train.size(); ++ff) - { - if(feature_expr_train[ff] != feature_expr_test[ff]) - throw std::logic_error("Features for train and test file do not agree"); - - split_str = str_utils::split_string_trim(feature_expr_train[ff]); - - int rung = std::stoi(split_str[0]); - std::string unit_str = split_str[1]; - std::string postfix_expr = split_str[2]; - std::string expr = split_str[3]; - - std::vector<double> feat_val(_n_samp_train); - std::vector<double> feat_test_val(_n_samp_test); - - std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data()); - std::copy_n(&_D_test[ff * _n_samp_test], _n_samp_test, feat_test_val.data()); - - _feats.push_back(std::make_shared<ModelNode>(ff, rung, expr, postfix_expr, feat_val, feat_test_val, Unit(unit_str))); - } -} - -std::vector<std::string> Model::populate_model(std::string filename, bool train) -{ - std::ifstream file_stream; - file_stream.open(filename, std::ios::in); - - std::vector<std::string> feature_expr; - std::vector<std::string> split_line; - - // Store model line - std::string model_line; - std::getline(file_stream, model_line); - - // Get the property unit and error - std::string unit_line; - std::string error_line; - - std::getline(file_stream, unit_line); - if(unit_line.substr(0, 8).compare("# RMSE: ") != 0) - { - split_line = str_utils::split_string_trim(unit_line); - _prop_unit = Unit(split_line.back()); - std::getline(file_stream, error_line); - } - else - { - _prop_unit = Unit(); - error_line = unit_line; - } - - split_line = str_utils::split_string_trim(error_line); - - 
double rmse = std::stod(split_line[1]); - double max_ae = std::stod(split_line[3]); - - // Get coefficients - std::string line; - std::getline(file_stream, line); - std::getline(file_stream, line); - - int n_task = 0; - int n_dim = 0; - std::getline(file_stream, line); - do - { - ++n_task; - split_line = str_utils::split_string_trim(line); - n_dim = split_line.size() - 3; - if(train) - { - _coefs.push_back(std::vector<double>(n_dim + 1, 0.0)); - std::transform(split_line.begin() + 1, split_line.end()-1, _coefs.back().data(), [](std::string s){return std::stod(s);}); - } - std::getline(file_stream, line); - } while(line.substr(0, 39).compare("# Feature Rung, Units, and Expressions") != 0); - - if(_coefs.back().back() == 0.0) - _fix_intercept = true; - else - _fix_intercept = false; - - _n_dim = n_dim + 1 - _fix_intercept; - - std::getline(file_stream, line); - for(int ff = 0; ff < n_dim; ++ff) - { - feature_expr.push_back(line.substr(6)); - std::getline(file_stream, line); - } - - std::getline(file_stream, line); - - int n_samp = 0; - for(int tt = 0; tt < n_task; ++tt) - { - std::getline(file_stream, line); - split_line = str_utils::split_string_trim(line); - n_samp += std::stoi(split_line[1]); - if(train) - _task_sizes_train.push_back(std::stoi(split_line[1])); - else - _task_sizes_test.push_back(std::stoi(split_line[1])); - } - - if(train) - { - _n_samp_train = n_samp; - _prop_train.resize(n_samp); - _prop_train_est.resize(n_samp); - _train_error.resize(n_samp); - } - else - { - _n_samp_test = n_samp; - _prop_test.resize(n_samp); - _prop_test_est.resize(n_samp); - _test_error.resize(n_samp); - } - - std::getline(file_stream, line); - std::getline(file_stream, line); - if(!train) - std::getline(file_stream, line); - - std::vector<std::vector<double>> feat_vals(n_dim, std::vector<double>(n_samp, 0.0)); - for(int ns = 0; ns < n_samp; ++ns) - { - std::getline(file_stream, line); - split_line = str_utils::split_string_trim(line); - if(train) - { - _prop_train[ns] = 
std::stod(split_line[0]); - _prop_train_est[ns] = std::stod(split_line[1]); - _train_error[ns] = _prop_train_est[ns] - _prop_train[ns]; - - } - else - { - _prop_test[ns] = std::stod(split_line[0]); - _prop_test_est[ns] = std::stod(split_line[1]); - _test_error[ns] = _prop_test_est[ns] - _prop_test[ns]; - } - for(int nf = 0; nf < n_dim; ++nf) - { - feat_vals[nf][ns] = std::stod(split_line[2 + nf]); - } - } - - if(train) - { - _D_train.resize(_n_dim * n_samp); - for(int nf = 0; nf < n_dim; ++nf) - std::copy_n(feat_vals[nf].data(), n_samp, &_D_train[nf * n_samp]); - } - else - { - _D_test.resize(_n_dim * n_samp); - for(int nf = 0; nf < n_dim; ++nf) - std::copy_n(feat_vals[nf].data(), n_samp, &_D_test[nf * n_samp]); - } - - file_stream.close(); - return feature_expr; -} - -std::string Model::toString() const -{ - std::stringstream unit_rep; - unit_rep << "c0"; - for(int ff = 0; ff < _feats.size(); ++ff) - unit_rep << " + a" << std::to_string(ff) << " * " << _feats[ff]->expr(); - return unit_rep.str(); -} - -std::ostream& operator<< (std::ostream& outStream, const Model& model) -{ - outStream << model.toString(); - return outStream; -} - -void Model::to_file(std::string filename, bool train, std::vector<int> test_inds) -{ - boost::filesystem::path p(filename.c_str()); - boost::filesystem::create_directories(p.remove_filename()); - - std::ofstream out_file_stream = std::ofstream(); - out_file_stream.open(filename); - - out_file_stream << "# " << toString() << std::endl; - out_file_stream << "# Property of the Unit: " << _prop_unit.toString() << std::endl; - if(train) - out_file_stream << "# RMSE: " << std::setprecision(15) << rmse() << "; Max AE: " << max_ae() << std::endl; - else - out_file_stream << "# RMSE: " << std::setprecision(15) << test_rmse() << "; Max AE: " << test_max_ae() << std::endl; - - out_file_stream << "# Coefficients" << std::endl; - out_file_stream << std::setw(10) << std::left << "# Task;"; - - for(int cc = 0; cc < _coefs[0].size() - (1 - 
_fix_intercept); ++cc) - out_file_stream << std::setw(24) << " a" + std::to_string(cc); - - if(!_fix_intercept) - out_file_stream << " c0" << std::endl; - else - out_file_stream << std::endl; - - for(int cc = 0; cc < _coefs.size(); ++cc) - { - out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc) + ", "; - for(auto& coeff : _coefs[cc]) - out_file_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", "; - out_file_stream << "\n"; - } - - out_file_stream << "# Feature Rung, Units, and Expressions" << std::endl; - for(int ff = 0; ff < _feats.size(); ++ff) - out_file_stream << std::setw(6) << std::left << "# " + std::to_string(ff) + ", " << std::to_string(_feats[ff]->rung()) + ", " << std::setw(50) << _feats[ff]->unit().toString() + ", " << _feats[ff]->postfix_expr() + "," << _feats[ff]->expr() << std::endl; - - out_file_stream << "# Number of Samples Per Task" << std::endl; - if(train) - { - out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_train" << std::endl; - for(int tt = 0; tt < _task_sizes_train.size(); ++tt) - out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", " << std::left << std::setw(22) << _task_sizes_train[tt] << std::endl; - } - else - { - out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_test" << std::endl; - for(int tt = 0; tt < _task_sizes_test.size(); ++tt) - out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", " << std::left << std::setw(22) << _task_sizes_test[tt] << std::endl; - - out_file_stream << "# Test Indexes: [ " << test_inds[0]; - for(int ii = 1; ii < test_inds.size(); ++ii) - out_file_stream << ", " << test_inds[ii]; - out_file_stream << " ]" << std::endl; - } - - out_file_stream << "\n" << std::setw(24) << std::left << "#Property Value" << std::setw(24) << " Property Value (EST)"; - for(int ff = 0; ff < _feats.size(); ++ff) - 
out_file_stream << std::setw(24) << " Feature " + std::to_string(ff) + " Value"; - out_file_stream << std::endl; - - if(train) - { - for(int ss = 0; ss < _n_samp_train; ++ss) - { - out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_train[ss] << std::setw(2) << ", " << std::setw(22) << _prop_train_est[ss]; - for(int ff = 0; ff < _n_dim; ++ff) - out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->value()[ss]; - out_file_stream << std::endl; - } - } - else - { - for(int ss = 0; ss < _n_samp_test; ++ss) - { - out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_test[ss] << std::setw(2) << ", " << std::setw(22) << _prop_test_est[ss]; - for(int ff = 0; ff < _n_dim; ++ff) - out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->test_value()[ss]; - out_file_stream << std::endl; - } - } - out_file_stream.close(); -} - -std::vector<double> Model::sorted_error() -{ - std::vector<double> sorted_error(_train_error.size(), 0.0); - std::copy_n(_train_error.data(), _train_error.size(), sorted_error.data()); - std::transform(sorted_error.begin(), sorted_error.end(), sorted_error.begin(), [](double e){return std::abs(e);}); - std::sort(sorted_error.begin(), sorted_error.end()); - return sorted_error; -} - -std::vector<double> Model::sorted_test_error() -{ - std::vector<double> sorted_error(_test_error.size(), 0.0); - std::copy_n(_test_error.data(), _test_error.size(), sorted_error.data()); - std::transform(sorted_error.begin(), sorted_error.end(), sorted_error.begin(), [](double e){return std::abs(e);}); - std::sort(sorted_error.begin(), sorted_error.end()); - return sorted_error; -} - -double Model::mape() -{ - std::vector<double> percent_error(_train_error.size(), 0.0); - std::transform(_train_error.begin(), _train_error.end(), _prop_train.begin(), percent_error.begin(), 
[](double e, double p){return std::abs(e / p);}); - return util_funcs::mean(percent_error); -} - -double Model::test_mape() -{ - std::vector<double> percent_error(_test_error.size(), 0.0); - std::transform(_test_error.begin(), _test_error.end(), _prop_test.begin(), percent_error.begin(), [](double e, double p){return std::abs(e / p);}); - return util_funcs::mean(percent_error); -} +Model::Model() +{} diff --git a/src/descriptor_identifier/Model/Model.hpp b/src/descriptor_identifier/Model/Model.hpp index b07228be43df5b2af2f66d7bd1105cf69959bb8f..36b1c1318a9b8aa7447741177f4ed17644be6b9e 100644 --- a/src/descriptor_identifier/Model/Model.hpp +++ b/src/descriptor_identifier/Model/Model.hpp @@ -35,6 +35,7 @@ typedef std::shared_ptr<ModelNode> model_node_ptr; */ class Model { +protected: int _n_samp_train; //!< The number of samples per feature int _n_samp_test; //!< The number of test samples per feature int _n_dim; //!< Dimension of the model @@ -44,14 +45,9 @@ class Model std::vector<std::vector<double>> _coefs; //!< Coefficients for the features std::vector<double> _prop_train; //!< The property to be modeled (training data) std::vector<double> _prop_test; //!< The property to be modeled (testing data) - std::vector<double> _train_error; //!< The error of the model (training) - std::vector<double> _test_error; //!< The error of the model (testing) std::vector<double> _D_train; //!< The Descriptor matrix (training data) std::vector<double> _D_test; //!< The Descriptor matrix (testing data) - std::vector<double> _prop_train_est; //!< The estimated Property (training data) - std::vector<double> _prop_test_est; //!< The estimated Property (testing data) - std::vector<int> _task_sizes_train; //!< Number of training samples in each task std::vector<int> _task_sizes_test; //!< Number of testing samples in each task @@ -59,6 +55,12 @@ class Model bool _fix_intercept; //!< If true fix intercept to 0 public: + // DocString: model_init + /** + * @brief Base constructer for a 
model + */ + Model(); + /** * @brief Constructor for the model * @@ -71,72 +73,33 @@ public: */ Model(Unit prop_unit, std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept); - // DocString: model_init_str /** - * @brief Construct a model from a training output file - * @details Reads in all of the data from the output file and recreates the model object + * @brief Copy Constructor * - * @param train_file Previously generated model file + * @param o Model to be copied */ - Model(std::string train_file); + Model(const Model& o) = default; - // DocString: model_init_str_str /** - * @brief Construct a model from a training and testing output file - * @details Reads in all of the data from the output files and recreates the model object + * @brief Move Constructor * - * @param train_file Previously generated training model output file - * @param train_file Previously generated testing model output file + * @param o Model to be moved */ - Model(std::string train_file, std::string test_file); + Model(Model&& o) = default; /** - * @brief Read an output file and extract all relevant information - * @details Takes in an output file and extracts all data needed to recreate the model - * - * @param filename Name of the output file - * @param train If true then the output represents training data + * @brief Copy Assignment operator * - * @return [description] + * @param o Model to be copied */ - std::vector<std::string> populate_model(std::string filename, bool train); + Model& operator= (const Model& o) = default; - // DocString: model_str /** - * @brief Convert the model to a string - - * @return The string representation of the model - */ - std::string toString() const; - - /** - * @brief The estimated property values - */ - inline std::vector<double> predict(){return _prop_test_est;} - - /** - * @brief Accessor function to predict property 
values - */ - inline std::vector<double> predict_train(){return _prop_train_est;} - - /** - * @brief Copy the error into a new array + * @brief Move Assignment operator * - * @param res pointer to the beginning of the vector to store the residual - */ - inline void copy_error(double* res){std::copy_n(_train_error.data(), _n_samp_train, res);} - - // DocString: model_rmse - /** - * @brief The training rmse of the model - */ - inline double rmse(){return util_funcs::norm(_train_error.data(), _n_samp_train) / std::sqrt(static_cast<double>(_n_samp_train));} - - // DocString: model_test_rmse - /** - * @brief The test rmse of the model + * @param o Model to be moved */ - inline double test_rmse(){return util_funcs::norm(_test_error.data(), _n_samp_test) / std::sqrt(static_cast<double>(_n_samp_test));} + Model& operator= (Model&& o) = default; // DocString: model_n_samp_train /** @@ -156,120 +119,6 @@ public: */ inline int n_dim(){return _n_dim;} - // DocString: model_max_ae - /** - * @brief The max Absolute error of the training data - */ - inline double max_ae() - { - return std::abs(*std::max_element(_train_error.data(), _train_error.data() + _n_samp_train, [](double d1, double d2){return std::abs(d1) < std::abs(d2);})); - } - - // DocString: model_test_max_ae - /** - * @brief The max Absolute error of the testing data - */ - inline double test_max_ae() - { - return std::abs(*std::max_element(_test_error.data(), _test_error.data() + _n_samp_test, [](double d1, double d2){return std::abs(d1) < std::abs(d2);})); - } - - // DocString: model_mae - /** - * @brief The mean absolute error of the model - * @return The mean absolute error of the training data - */ - inline double mae(){return std::accumulate(_train_error.begin(), _train_error.end(), 0.0, [](double total, double e){return total + std::abs(e);}) / _n_samp_train;} - - // DocString: model_test_mae - /** - * @brief The mean absolute test error of the model - * @return The mean absolute error of the test data - */ - 
inline double test_mae(){return std::accumulate(_test_error.begin(), _test_error.end(), 0.0, [](double total, double e){return total + std::abs(e);}) / _n_samp_test;} - - // DocString: model_mape - /** - * @brief The mean absolute error of the model - * @return The mean absolute error of the training data - */ - double mape(); - - // DocString: model_test_mape - /** - * @brief The mean absolute test error of the model - * @return The mean absolute error of the test data - */ - double test_mape(); - - /** - * @brief Sort the training error based on magnitude - * @return The error vector sorted - */ - std::vector<double> sorted_error(); - - /** - * @brief Sort the training test_error based on magnitude - * @return The test_error vector sorted - */ - std::vector<double> sorted_test_error(); - - // DocString: model_percentile_25_ae - /** - * @brief The mean absolute error of the model - * @return The mean absolute error of the training data - */ - inline double percentile_25_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.25))];} - - // DocString: model_test_percentile_25_ae - /** - * @brief The mean absolute test error of the model - * @return The mean absolute error of the test data - */ - inline double percentile_25_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.25))];} - - // DocString: model_percentile_50_ae - /** - * @brief The mean absolute error of the model - * @return The mean absolute error of the training data - */ - inline double percentile_50_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.50))];} - - // DocString: model_test_percentile_50_ae - /** - * @brief The mean absolute test error of the model - * @return The mean absolute error of the test data - */ - inline double percentile_50_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.50))];} - - // DocString: model_percentile_75_ae - /** - * @brief The mean absolute error of the model - * @return The mean 
absolute error of the training data - */ - inline double percentile_75_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.75))];} - - // DocString: model_test_percentile_75_ae - /** - * @brief The mean absolute test error of the model - * @return The mean absolute error of the test data - */ - inline double percentile_75_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.75))];} - - // DocString: model_percentile_95_ae - /** - * @brief The mean absolute error of the model - * @return The mean absolute error of the training data - */ - inline double percentile_95_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.95))];} - - // DocString: model_test_percentile_95_ae - /** - * @brief The mean absolute test error of the model - * @return The mean absolute error of the test data - */ - inline double percentile_95_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.95))];} - // DocString: model_prop_unit /** * @brief The unit of the property @@ -284,7 +133,7 @@ public: * @param train If true output the training data * @param test_inds The indexes of the test set */ - void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {}); + virtual void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {}) = 0; #ifdef PY_BINDINGS // DocString: model_coefs @@ -313,20 +162,6 @@ public: return feat_lst; } - // DocString: model_prop_train_est - /** - * @brief The estimation of the property - * @return _prop_train_est as a numpy ndarray - */ - inline np::ndarray prop_train_est(){return python_conv_utils::to_ndarray<double>(_prop_train_est);} - - // DocString: model_prop_test_est - /** - * @brief The estimation of the properties test values - * @return _prop_test_est as a numpy ndarray - */ - inline np::ndarray prop_test_est(){return python_conv_utils::to_ndarray<double>(_prop_test_est);} - // DocString: model_prop_train /** * @brief The training 
data property to be learned @@ -341,28 +176,7 @@ public: */ inline np::ndarray prop_test(){return python_conv_utils::to_ndarray<double>(_prop_test);} - // DocString: model_train_error - /** - * @brief The training error - * @return _train_error as a numpy ndarray - */ - inline np::ndarray train_error(){return python_conv_utils::to_ndarray<double>(_train_error);} - - // DocString: model_test_error - /** - * @brief The test error - * @return _test_error as a numpy ndarray - */ - inline np::ndarray test_error(){return python_conv_utils::to_ndarray<double>(_test_error);} #endif }; -/** - * @brief Print a model to an string stream - * - * @param outStream The output stream the model is to be printed - * @param model The model to be printed - */ -std::ostream& operator<< (std::ostream& outStream, const Model& model); - #endif \ No newline at end of file diff --git a/src/descriptor_identifier/Model/ModelClassifier.cpp b/src/descriptor_identifier/Model/ModelClassifier.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7108e9edfd0fe33e03c42284df08941f54b239ba --- /dev/null +++ b/src/descriptor_identifier/Model/ModelClassifier.cpp @@ -0,0 +1,386 @@ +#include <descriptor_identifier/Model/ModelClassifier.hpp> + +ModelClassifier::ModelClassifier(Unit prop_unit, std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept) : + Model(prop_unit, prop_train, prop_test, feats, task_sizes_train, task_sizes_test, fix_intercept), + _prop_train_est(_n_samp_train, 0.0), + _prop_test_est(_n_samp_test, 0.0), + _train_error(_n_samp_train), + _test_error(_n_samp_test) +{ + _prop_train_est.reserve(_n_samp_train); + _prop_test_est.reserve(_n_samp_test); + + std::vector<shark::LabeledData<shark::RealVector, unsigned int>> training(_task_sizes_train.size()); + std::vector<shark::LabeledData<shark::RealVector, unsigned int>> 
test(_task_sizes_test.size()); + std::vector<unsigned int> prop_to_copy; + int n_els = 0; + for(int tt = 0; tt < _task_sizes_train.size(); ++tt) + { + shark::Data<shark::RealVector> data(_task_sizes_train[tt], shark::RealVector(_n_dim)); + shark::Data<unsigned int> labels(_task_sizes_train[tt], 0); + + for(int ii = 0; ii < _task_sizes_train[tt]; ++ii) + prop_to_copy[ii] = static_cast<unsigned int>(_prop_train[ii + n_els]); + + int n_copied = 0; + for(auto& batch : labels.batches()) + { + std::copy_n(prop_to_copy.begin() + n_copied, shark::batchSize(batch), &batch(0)); + n_els += shark::batchSize(batch); + n_copied += shark::batchSize(batch); + } + training[tt] = shark::LabeledData<shark::RealVector, unsigned int>(data, labels); + } + + n_els = 0; + for(int tt = 0; tt < _task_sizes_test.size(); ++tt) + { + shark::Data<shark::RealVector> data(_task_sizes_test[tt], shark::RealVector(_n_dim)); + shark::Data<unsigned int> labels(_task_sizes_test[tt], 0); + + for(int ii = 0; ii < _task_sizes_test[tt]; ++ii) + prop_to_copy[ii] = static_cast<unsigned int>(_prop_test[ii + n_els]); + + int n_copied = 0; + for(auto& batch : labels.batches()) + { + std::copy_n(prop_to_copy.begin() + n_copied, shark::batchSize(batch), &batch(0)); + n_els += shark::batchSize(batch); + n_copied += shark::batchSize(batch); + } + test[tt] = shark::LabeledData<shark::RealVector, unsigned int>(data, labels); + } + + shark::LinearCSvmTrainer<shark::RealVector> trainer(1.0, !_fix_intercept); + shark::LinearClassifier<shark::RealVector> model; + shark::ZeroOneLoss<unsigned int> loss; + shark::Data<unsigned int> output; + int ii_train = 0; + int ii_test = 0; + for(int tt = 0; tt < _task_sizes_train.size(); ++tt) + { + trainer.train(model, training[tt]); + output = model(training[tt].inputs()); + for(auto element: output.elements()) + { + _prop_train_est[ii_train] = static_cast<double>(element); + ++ii_train; + } + + output = model(test[tt].inputs()); + for(auto element: output.elements()) + { + 
_prop_train_est[ii_test] = static_cast<double>(element); + ++ii_test; + } + if(_fix_intercept) + { + _coefs.push_back(std::vector<double>(_n_dim, 0.0)); + std::copy_n(model.parameterVector().begin(), _n_dim, _coefs.back().begin()); + } + else + { + _coefs.push_back(std::vector<double>(_n_dim + 1, model.bias()(0))); + std::copy_n(model.parameterVector().begin(), _n_dim, _coefs.back().begin()); + } + } + std::transform(_prop_train_est.begin(), _prop_train_est.end(), _prop_train.begin(), _train_error.begin(), [](double est, double real){return (std::abs(est - real) < 1e-10) ? -1 : real;}); + std::transform(_prop_test_est.begin(), _prop_test_est.end(), _prop_test.begin(), _test_error.begin(), [](double est, double real){return (std::abs(est - real) < 1e-10) ? -1 : real;}); +} + +ModelClassifier::ModelClassifier(std::string train_file) +{ + _n_samp_test = 0; + + std::vector<std::string> split_str; + std::vector<std::string> feature_expr_train = populate_model(train_file, true); + + for(int ff = 0; ff < feature_expr_train.size(); ++ff) + { + split_str = str_utils::split_string_trim(feature_expr_train[ff]); + int rung = std::stoi(split_str[0]); + std::string unit_str = split_str[1]; + std::string postfix_expr = split_str[2]; + std::string expr = split_str[3]; + + std::vector<double> feat_val(_n_samp_train); + std::vector<double> feat_test_val = {}; + std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data()); + + model_node_ptr feat = std::make_shared<ModelNode>(ff, rung, expr, postfix_expr, feat_val, feat_test_val, Unit(unit_str)); + _feats.push_back(feat); + } + +} + +ModelClassifier::ModelClassifier(std::string train_file, std::string test_file) +{ + std::vector<std::string> split_str; + std::vector<std::string> feature_expr_train = populate_model(train_file, true); + std::vector<std::string> feature_expr_test = populate_model(test_file, false); + + for(int ff = 0; ff < feature_expr_train.size(); ++ff) + { + if(feature_expr_train[ff] != 
feature_expr_test[ff]) + throw std::logic_error("Features for train and test file do not agree"); + + split_str = str_utils::split_string_trim(feature_expr_train[ff]); + + int rung = std::stoi(split_str[0]); + std::string unit_str = split_str[1]; + std::string postfix_expr = split_str[2]; + std::string expr = split_str[3]; + + std::vector<double> feat_val(_n_samp_train); + std::vector<double> feat_test_val(_n_samp_test); + + std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data()); + std::copy_n(&_D_test[ff * _n_samp_test], _n_samp_test, feat_test_val.data()); + + _feats.push_back(std::make_shared<ModelNode>(ff, rung, expr, postfix_expr, feat_val, feat_test_val, Unit(unit_str))); + } +} + +std::vector<std::string> ModelClassifier::populate_model(std::string filename, bool train) +{ + std::ifstream file_stream; + file_stream.open(filename, std::ios::in); + + std::vector<std::string> feature_expr; + std::vector<std::string> split_line; + + // Store model line + std::string model_line; + std::getline(file_stream, model_line); + + // Get the property unit and error + std::string unit_line; + std::string error_line; + + std::getline(file_stream, unit_line); + if(unit_line.substr(0, 16).compare("# Percent Error:") != 0) + { + split_line = str_utils::split_string_trim(unit_line); + _prop_unit = Unit(split_line.back()); + std::getline(file_stream, error_line); + } + else + { + _prop_unit = Unit(); + error_line = unit_line; + } + + split_line = str_utils::split_string_trim(error_line); + + double percent_error = std::stod(split_line[1]); + + // Get coefficients + std::string line; + std::getline(file_stream, line); + std::getline(file_stream, line); + + int n_task = 0; + int n_dim = 0; + std::getline(file_stream, line); + do + { + ++n_task; + split_line = str_utils::split_string_trim(line); + n_dim = split_line.size() - 3; + if(train) + { + _coefs.push_back(std::vector<double>(n_dim + 1, 0.0)); + std::transform(split_line.begin() + 1, split_line.end()-1, 
_coefs.back().data(), [](std::string s){return std::stod(s);}); + } + std::getline(file_stream, line); + } while(line.substr(0, 39).compare("# Feature Rung, Units, and Expressions") != 0); + + if(_coefs.back().back() == 0.0) + _fix_intercept = true; + else + _fix_intercept = false; + + _n_dim = n_dim + 1 - _fix_intercept; + + std::getline(file_stream, line); + for(int ff = 0; ff < n_dim; ++ff) + { + feature_expr.push_back(line.substr(6)); + std::getline(file_stream, line); + } + + std::getline(file_stream, line); + + int n_samp = 0; + for(int tt = 0; tt < n_task; ++tt) + { + std::getline(file_stream, line); + split_line = str_utils::split_string_trim(line); + n_samp += std::stoi(split_line[1]); + if(train) + _task_sizes_train.push_back(std::stoi(split_line[1])); + else + _task_sizes_test.push_back(std::stoi(split_line[1])); + } + + if(train) + { + _n_samp_train = n_samp; + _prop_train.resize(n_samp); + _prop_train_est.resize(n_samp); + _train_error.resize(n_samp); + } + else + { + _n_samp_test = n_samp; + _prop_test.resize(n_samp); + _prop_test_est.resize(n_samp); + _test_error.resize(n_samp); + } + + std::getline(file_stream, line); + std::getline(file_stream, line); + if(!train) + std::getline(file_stream, line); + + std::vector<std::vector<double>> feat_vals(n_dim, std::vector<double>(n_samp, 0.0)); + for(int ns = 0; ns < n_samp; ++ns) + { + std::getline(file_stream, line); + split_line = str_utils::split_string_trim(line); + if(train) + { + _prop_train[ns] = std::stod(split_line[0]); + _prop_train_est[ns] = std::stod(split_line[1]); + _train_error[ns] = _prop_train_est[ns] - _prop_train[ns]; + + } + else + { + _prop_test[ns] = std::stod(split_line[0]); + _prop_test_est[ns] = std::stod(split_line[1]); + _test_error[ns] = _prop_test_est[ns] - _prop_test[ns]; + } + for(int nf = 0; nf < n_dim; ++nf) + { + feat_vals[nf][ns] = std::stod(split_line[2 + nf]); + } + } + + if(train) + { + _D_train.resize(_n_dim * n_samp); + for(int nf = 0; nf < n_dim; ++nf) + 
std::copy_n(feat_vals[nf].data(), n_samp, &_D_train[nf * n_samp]); + } + else + { + _D_test.resize(_n_dim * n_samp); + for(int nf = 0; nf < n_dim; ++nf) + std::copy_n(feat_vals[nf].data(), n_samp, &_D_test[nf * n_samp]); + } + + file_stream.close(); + return feature_expr; +} + +std::string ModelClassifier::toString() const +{ + std::stringstream unit_rep; + unit_rep << "[" << _feats[0]->expr(); + for(int ff = 1; ff < _feats.size(); ++ff) + unit_rep << ", " << _feats[ff]->expr(); + unit_rep << "]"; + return unit_rep.str(); +} + +std::ostream& operator<< (std::ostream& outStream, const ModelClassifier& model) +{ + outStream << model.toString(); + return outStream; +} + +void ModelClassifier::to_file(std::string filename, bool train, std::vector<int> test_inds) +{ + boost::filesystem::path p(filename.c_str()); + boost::filesystem::create_directories(p.remove_filename()); + + std::ofstream out_file_stream = std::ofstream(); + out_file_stream.open(filename); + + out_file_stream << "# " << toString() << std::endl; + out_file_stream << "# Property of the Unit: " << _prop_unit.toString() << std::endl; + if(train) + out_file_stream << "# Percent Error: " << std::setprecision(15) << percent_train_error() << std::endl; + else + out_file_stream << "# Percent Error: " << std::setprecision(15) << percent_test_error() << std::endl; + + out_file_stream << "# Plane Divider" << std::endl; + out_file_stream << std::setw(10) << std::left << "# Task"; + + for(int cc = 0; cc < _coefs[0].size() - (1 - _fix_intercept); ++cc) + out_file_stream << std::setw(24) << " w" + std::to_string(cc); + + if(!_fix_intercept) + out_file_stream << " b" << std::endl; + else + out_file_stream << std::endl; + + for(int cc = 0; cc < _coefs.size(); ++cc) + { + out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc) + ", "; + for(auto& coeff : _coefs[cc]) + out_file_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", "; + 
out_file_stream << "\n"; + } + + out_file_stream << "# Feature Rung, Units, and Expressions" << std::endl; + for(int ff = 0; ff < _feats.size(); ++ff) + out_file_stream << std::setw(6) << std::left << "# " + std::to_string(ff) + ", " << std::to_string(_feats[ff]->rung()) + ", " << std::setw(50) << _feats[ff]->unit().toString() + ", " << _feats[ff]->postfix_expr() + "," << _feats[ff]->expr() << std::endl; + + out_file_stream << "# Number of Samples Per Task" << std::endl; + if(train) + { + out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_train" << std::endl; + for(int tt = 0; tt < _task_sizes_train.size(); ++tt) + out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", " << std::left << std::setw(22) << _task_sizes_train[tt] << std::endl; + } + else + { + out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_test" << std::endl; + for(int tt = 0; tt < _task_sizes_test.size(); ++tt) + out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", " << std::left << std::setw(22) << _task_sizes_test[tt] << std::endl; + + out_file_stream << "# Test Indexes: [ " << test_inds[0]; + for(int ii = 1; ii < test_inds.size(); ++ii) + out_file_stream << ", " << test_inds[ii]; + out_file_stream << " ]" << std::endl; + } + + out_file_stream << "\n" << std::setw(24) << std::left << "#Property Value" << std::setw(24) << " Property Value (EST)"; + for(int ff = 0; ff < _feats.size(); ++ff) + out_file_stream << std::setw(24) << " Feature " + std::to_string(ff) + " Value"; + out_file_stream << std::endl; + + if(train) + { + for(int ss = 0; ss < _n_samp_train; ++ss) + { + out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_train[ss] << std::setw(2) << ", " << std::setw(22) << _prop_train_est[ss]; + for(int ff = 0; ff < _n_dim; ++ff) + out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << 
_feats[ff]->value()[ss]; + out_file_stream << std::endl; + } + } + else + { + for(int ss = 0; ss < _n_samp_test; ++ss) + { + out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_test[ss] << std::setw(2) << ", " << std::setw(22) << _prop_test_est[ss]; + for(int ff = 0; ff < _n_dim; ++ff) + out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->test_value()[ss]; + out_file_stream << std::endl; + } + } + out_file_stream.close(); +} diff --git a/src/descriptor_identifier/Model/ModelClassifier.hpp b/src/descriptor_identifier/Model/ModelClassifier.hpp new file mode 100644 index 0000000000000000000000000000000000000000..889b8945775dbed84b5285355e809f2852ff1df8 --- /dev/null +++ b/src/descriptor_identifier/Model/ModelClassifier.hpp @@ -0,0 +1,197 @@ +/** @file descriptor_identifier/ModelClassifier/ModelClassifier.hpp + * @brief Object to store the models generated form SISSO + * + * Creates a ModelClassifier generated from SISSO and the corresponding output file. + * It also has functionality to read in an output file to regenerate the model. + * + * @author Thomas A. R. Purcell (tpurcell) + * @bug No known bugs. 
+ */ +#ifndef MODEL_CLASSIFIER +#define MODEL_CLASSIFIER + +#include <boost/algorithm/string.hpp> +#include <boost/algorithm/string/trim.hpp> +#include <boost/filesystem.hpp> + +#include<iomanip> +#include<fstream> +#include<iostream> + +#include <shark/Data/Dataset.h> +#include <shark/ObjectiveFunctions/Loss/ZeroOneLoss.h> +#include <shark/Algorithms/Trainers/CSvmTrainer.h> + +#include <descriptor_identifier/Model/Model.hpp> +#include <feature_creation/node/ModelNode.hpp> +#include <utils/string_utils.hpp> + +#ifdef PY_BINDINGS + namespace np = boost::python::numpy; + namespace py = boost::python; +#endif + +typedef std::shared_ptr<ModelNode> model_node_ptr; + +// DocString: cls_model_class +/** + * @brief Class to store the models found from SISSO + * + */ +class ModelClassifier : public Model +{ + std::vector<double> _prop_train_est; //!< The estimated Property (training data) + std::vector<double> _prop_test_est; //!< The estimated Property (testing data) + + std::vector<double> _train_error; //!< The error of the model (training) + std::vector<double> _test_error; //!< The error of the model (testing) + +public: + /** + * @brief Constructor for the model + * + * @param prop_unit The unit of the property + * @param prop_train The property vector for the training samples + * @param prop_test The property vector for the test samples + * @param feats The features for the model + * @param task_sizes_train Number of samples per task in the training data + * @param task_sizes_test Number of samples per task in the test data + */ + ModelClassifier(Unit prop_unit, std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept); + + // DocString: model_class_init_str + /** + * @brief Construct a model from a training output file + * @details Reads in all of the data from the output file and recreates the model object + * + * @param train_file 
Previously generated model file + */ + ModelClassifier(std::string train_file); + + // DocString: model_class_init_str_str + /** + * @brief Construct a model from a training and testing output file + * @details Reads in all of the data from the output files and recreates the model object + * + * @param train_file Previously generated training model output file + * @param train_file Previously generated testing model output file + */ + ModelClassifier(std::string train_file, std::string test_file); + + + /** + * @brief Copy Constructor + * + * @param o ModelClassifier to be copied + */ + ModelClassifier(const ModelClassifier& o) = default; + + /** + * @brief Move Constructor + * + * @param o ModelClassifier to be moved + */ + ModelClassifier(ModelClassifier&& o) = default; + + /** + * @brief Copy Assignment operator + * + * @param o ModelClassifier to be copied + */ + ModelClassifier& operator= (const ModelClassifier& o) = default; + + /** + * @brief Move Assignment operator + * + * @param o ModelClassifier to be moved + */ + ModelClassifier& operator= (ModelClassifier&& o) = default; + + /** + * @brief Read an output file and extract all relevant information + * @details Takes in an output file and extracts all data needed to recreate the model + * + * @param filename Name of the output file + * @param train If true then the output represents training data + * + * @return [description] + */ + std::vector<std::string> populate_model(std::string filename, bool train); + + // DocString: model_class_str + /** + * @brief Convert the model to a string + + * @return The string representation of the model + */ + std::string toString() const; + + /** + * @brief Copy the error into a new array + * + * @param res pointer to the beginning of the vector to store the residual + */ + inline void copy_error(double* res){std::copy_n(_train_error.data(), _n_samp_train, res);} + + /** + * @brief Convert the ModelClassifier into an output file + * + * @param filename The name of the 
file to output to + * @param train If true output the training data + * @param test_inds The indexes of the test set + */ + void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {}); + + // DocString: model_class_precent_train_error + /** + * @brief Percent of all training samples misclassified + */ + inline double percent_train_error(){100.0 * std::count_if(_train_error.begin(), _train_error.end(), [](double te){return te >= 0;}) / _n_samp_train; } + + // DocString: model_class_precent_test_error + /** + * @brief Percent of all tset samples misclassified + */ + inline double percent_test_error(){100.0 * std::count_if(_test_error.begin(), _test_error.end(), [](double te){return te >= 0;}) / _n_samp_test; } + + #ifdef PY_BINDINGS + + // DocString: model_class_prop_train_est + /** + * @brief The estimation of the property + * @return _prop_train_est as a numpy ndarray + */ + inline np::ndarray prop_train_est(){return python_conv_utils::to_ndarray<double>(_prop_train_est);} + + // DocString: model_class_prop_test_est + /** + * @brief The estimation of the properties test values + * @return _prop_test_est as a numpy ndarray + */ + inline np::ndarray prop_test_est(){return python_conv_utils::to_ndarray<double>(_prop_test_est);} + + // DocString: model_class_train_error + /** + * @brief The training error + * @return _train_error as a numpy ndarray + */ + inline np::ndarray train_error(){return python_conv_utils::to_ndarray<double>(_train_error);} + + // DocString: model_class_test_error + /** + * @brief The test error + * @return _test_error as a numpy ndarray + */ + inline np::ndarray test_error(){return python_conv_utils::to_ndarray<double>(_test_error);} + #endif +}; + +/** + * @brief Print a model to an string stream + * + * @param outStream The output stream the model is to be printed + * @param model The model to be printed + */ +std::ostream& operator<< (std::ostream& outStream, const ModelClassifier& model); + +#endif \ No newline at 
end of file diff --git a/src/descriptor_identifier/Model/ModelRegressor.cpp b/src/descriptor_identifier/Model/ModelRegressor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3e5116db514f0f41aff32076935a846c85e1050e --- /dev/null +++ b/src/descriptor_identifier/Model/ModelRegressor.cpp @@ -0,0 +1,387 @@ +#include <descriptor_identifier/Model/ModelRegressor.hpp> + +ModelRegressor::ModelRegressor(Unit prop_unit, std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept) : + Model(prop_unit, prop_train, prop_test, feats, task_sizes_train, task_sizes_test, fix_intercept), + _prop_train_est(_n_samp_train, 0.0), + _prop_test_est(_n_samp_test, 0.0), + _train_error(_n_samp_train), + _test_error(_n_samp_test) +{ + _prop_train_est.reserve(_n_samp_train); + _prop_test_est.reserve(_n_samp_test); + + std::vector<double> a(_D_train.size(), 1.0); + std::vector<double> work(_D_train.size(), 0.0); + + int info; + int start = 0; + + std::copy_n(_D_train.begin(), _D_train.size(), a.begin()); + for(auto& sz : _task_sizes_train) + { + std::fill_n(a.data(), a.size(), 1.0); + std::fill_n(_D_train.data(), _D_train.size(), 1.0); + + for(int ff = 0; ff < feats.size(); ++ff) + { + std::copy_n(feats[ff]->value_ptr() + start, sz, _D_train.data() + ff * sz); + std::copy_n(feats[ff]->value_ptr() + start, sz, a.data() + ff * sz); + } + + dgels_('N', sz, _n_dim + 1 - _fix_intercept, 1, a.data(), sz, prop_train.data() + start, sz, work.data(), work.size(), &info); + + _coefs.push_back(std::vector<double>(_n_dim + (!fix_intercept), 0.0)); + + std::copy_n(prop_train.begin() + start, _n_dim + 1 - _fix_intercept, _coefs.back().data()); + dgemv_('N', sz, _n_dim + 1 - _fix_intercept, 1.0, _D_train.data(), sz, _coefs.back().data(), 1, 0.0, _prop_train_est.data() + start, 1); + std::transform(_prop_train_est.begin() + start, _prop_train_est.begin() + start + 
sz, _prop_train.data() + start, _train_error.data() + start, std::minus<double>()); + + start += sz; + } + + start = 0; + int ii = 0; + for(auto& sz : _task_sizes_test) + { + if(sz > 0) + { + for(int ff = 0; ff < feats.size(); ++ff) + std::copy_n(feats[ff]->test_value().data() + start, sz, _D_test.data() + ff * sz); + + std::fill_n(_D_test.data() + feats.size() * sz, sz, 1.0); + dgemv_('N', sz, _n_dim + 1 - _fix_intercept, 1.0, _D_test.data(), sz, _coefs[ii].data(), 1, 0.0, _prop_test_est.data() + start, 1); + std::transform(_prop_test_est.begin() + start, _prop_test_est.begin() + start + sz, _prop_test.data() + start, _test_error.data() + start, std::minus<double>()); + } + ++ii; + start += sz; + } +} + +ModelRegressor::ModelRegressor(std::string train_file) +{ + _n_samp_test = 0; + + std::vector<std::string> split_str; + std::vector<std::string> feature_expr_train = populate_model(train_file, true); + + for(int ff = 0; ff < feature_expr_train.size(); ++ff) + { + split_str = str_utils::split_string_trim(feature_expr_train[ff]); + int rung = std::stoi(split_str[0]); + std::string unit_str = split_str[1]; + std::string postfix_expr = split_str[2]; + std::string expr = split_str[3]; + + std::vector<double> feat_val(_n_samp_train); + std::vector<double> feat_test_val = {}; + std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data()); + + model_node_ptr feat = std::make_shared<ModelNode>(ff, rung, expr, postfix_expr, feat_val, feat_test_val, Unit(unit_str)); + _feats.push_back(feat); + } + +} + +ModelRegressor::ModelRegressor(std::string train_file, std::string test_file) +{ + std::vector<std::string> split_str; + std::vector<std::string> feature_expr_train = populate_model(train_file, true); + std::vector<std::string> feature_expr_test = populate_model(test_file, false); + + for(int ff = 0; ff < feature_expr_train.size(); ++ff) + { + if(feature_expr_train[ff] != feature_expr_test[ff]) + throw std::logic_error("Features for train and test file do not 
agree"); + + split_str = str_utils::split_string_trim(feature_expr_train[ff]); + + int rung = std::stoi(split_str[0]); + std::string unit_str = split_str[1]; + std::string postfix_expr = split_str[2]; + std::string expr = split_str[3]; + + std::vector<double> feat_val(_n_samp_train); + std::vector<double> feat_test_val(_n_samp_test); + + std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data()); + std::copy_n(&_D_test[ff * _n_samp_test], _n_samp_test, feat_test_val.data()); + + _feats.push_back(std::make_shared<ModelNode>(ff, rung, expr, postfix_expr, feat_val, feat_test_val, Unit(unit_str))); + } +} + +std::vector<std::string> ModelRegressor::populate_model(std::string filename, bool train) +{ + std::ifstream file_stream; + file_stream.open(filename, std::ios::in); + + std::vector<std::string> feature_expr; + std::vector<std::string> split_line; + + // Store model line + std::string model_line; + std::getline(file_stream, model_line); + + // Get the property unit and error + std::string unit_line; + std::string error_line; + + std::getline(file_stream, unit_line); + if(unit_line.substr(0, 8).compare("# RMSE: ") != 0) + { + split_line = str_utils::split_string_trim(unit_line); + _prop_unit = Unit(split_line.back()); + std::getline(file_stream, error_line); + } + else + { + _prop_unit = Unit(); + error_line = unit_line; + } + + split_line = str_utils::split_string_trim(error_line); + + double rmse = std::stod(split_line[1]); + double max_ae = std::stod(split_line[3]); + + // Get coefficients + std::string line; + std::getline(file_stream, line); + std::getline(file_stream, line); + + int n_task = 0; + int n_dim = 0; + std::getline(file_stream, line); + do + { + ++n_task; + split_line = str_utils::split_string_trim(line); + n_dim = split_line.size() - 3; + if(train) + { + _coefs.push_back(std::vector<double>(n_dim + 1, 0.0)); + std::transform(split_line.begin() + 1, split_line.end()-1, _coefs.back().data(), [](std::string s){return std::stod(s);}); + 
} + std::getline(file_stream, line); + } while(line.substr(0, 39).compare("# Feature Rung, Units, and Expressions") != 0); + + if(_coefs.back().back() == 0.0) + _fix_intercept = true; + else + _fix_intercept = false; + + _n_dim = n_dim + 1 - _fix_intercept; + + std::getline(file_stream, line); + for(int ff = 0; ff < n_dim; ++ff) + { + feature_expr.push_back(line.substr(6)); + std::getline(file_stream, line); + } + + std::getline(file_stream, line); + + int n_samp = 0; + for(int tt = 0; tt < n_task; ++tt) + { + std::getline(file_stream, line); + split_line = str_utils::split_string_trim(line); + n_samp += std::stoi(split_line[1]); + if(train) + _task_sizes_train.push_back(std::stoi(split_line[1])); + else + _task_sizes_test.push_back(std::stoi(split_line[1])); + } + + if(train) + { + _n_samp_train = n_samp; + _prop_train.resize(n_samp); + _prop_train_est.resize(n_samp); + _train_error.resize(n_samp); + } + else + { + _n_samp_test = n_samp; + _prop_test.resize(n_samp); + _prop_test_est.resize(n_samp); + _test_error.resize(n_samp); + } + + std::getline(file_stream, line); + std::getline(file_stream, line); + if(!train) + std::getline(file_stream, line); + + std::vector<std::vector<double>> feat_vals(n_dim, std::vector<double>(n_samp, 0.0)); + for(int ns = 0; ns < n_samp; ++ns) + { + std::getline(file_stream, line); + split_line = str_utils::split_string_trim(line); + if(train) + { + _prop_train[ns] = std::stod(split_line[0]); + _prop_train_est[ns] = std::stod(split_line[1]); + _train_error[ns] = _prop_train_est[ns] - _prop_train[ns]; + + } + else + { + _prop_test[ns] = std::stod(split_line[0]); + _prop_test_est[ns] = std::stod(split_line[1]); + _test_error[ns] = _prop_test_est[ns] - _prop_test[ns]; + } + for(int nf = 0; nf < n_dim; ++nf) + { + feat_vals[nf][ns] = std::stod(split_line[2 + nf]); + } + } + + if(train) + { + _D_train.resize(_n_dim * n_samp); + for(int nf = 0; nf < n_dim; ++nf) + std::copy_n(feat_vals[nf].data(), n_samp, &_D_train[nf * n_samp]); + } + else 
+ { + _D_test.resize(_n_dim * n_samp); + for(int nf = 0; nf < n_dim; ++nf) + std::copy_n(feat_vals[nf].data(), n_samp, &_D_test[nf * n_samp]); + } + + file_stream.close(); + return feature_expr; +} + +std::string ModelRegressor::toString() const +{ + std::stringstream unit_rep; + unit_rep << "c0"; + for(int ff = 0; ff < _feats.size(); ++ff) + unit_rep << " + a" << std::to_string(ff) << " * " << _feats[ff]->expr(); + return unit_rep.str(); +} + +std::ostream& operator<< (std::ostream& outStream, const ModelRegressor& model) +{ + outStream << model.toString(); + return outStream; +} + +void ModelRegressor::to_file(std::string filename, bool train, std::vector<int> test_inds) +{ + boost::filesystem::path p(filename.c_str()); + boost::filesystem::create_directories(p.remove_filename()); + + std::ofstream out_file_stream = std::ofstream(); + out_file_stream.open(filename); + + out_file_stream << "# " << toString() << std::endl; + out_file_stream << "# Property of the Unit: " << _prop_unit.toString() << std::endl; + if(train) + out_file_stream << "# RMSE: " << std::setprecision(15) << rmse() << "; Max AE: " << max_ae() << std::endl; + else + out_file_stream << "# RMSE: " << std::setprecision(15) << test_rmse() << "; Max AE: " << test_max_ae() << std::endl; + + out_file_stream << "# Coefficients" << std::endl; + out_file_stream << std::setw(10) << std::left << "# Task"; + + for(int cc = 0; cc < _coefs[0].size() - (1 - _fix_intercept); ++cc) + out_file_stream << std::setw(24) << " a" + std::to_string(cc); + + if(!_fix_intercept) + out_file_stream << " c0" << std::endl; + else + out_file_stream << std::endl; + + for(int cc = 0; cc < _coefs.size(); ++cc) + { + out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc) + ", "; + for(auto& coeff : _coefs[cc]) + out_file_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", "; + out_file_stream << "\n"; + } + + out_file_stream << "# Feature Rung, Units, 
and Expressions" << std::endl; + for(int ff = 0; ff < _feats.size(); ++ff) + out_file_stream << std::setw(6) << std::left << "# " + std::to_string(ff) + ", " << std::to_string(_feats[ff]->rung()) + ", " << std::setw(50) << _feats[ff]->unit().toString() + ", " << _feats[ff]->postfix_expr() + "," << _feats[ff]->expr() << std::endl; + + out_file_stream << "# Number of Samples Per Task" << std::endl; + if(train) + { + out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_train" << std::endl; + for(int tt = 0; tt < _task_sizes_train.size(); ++tt) + out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", " << std::left << std::setw(22) << _task_sizes_train[tt] << std::endl; + } + else + { + out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_test" << std::endl; + for(int tt = 0; tt < _task_sizes_test.size(); ++tt) + out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", " << std::left << std::setw(22) << _task_sizes_test[tt] << std::endl; + + out_file_stream << "# Test Indexes: [ " << test_inds[0]; + for(int ii = 1; ii < test_inds.size(); ++ii) + out_file_stream << ", " << test_inds[ii]; + out_file_stream << " ]" << std::endl; + } + + out_file_stream << "\n" << std::setw(24) << std::left << "#Property Value" << std::setw(24) << " Property Value (EST)"; + for(int ff = 0; ff < _feats.size(); ++ff) + out_file_stream << std::setw(24) << " Feature " + std::to_string(ff) + " Value"; + out_file_stream << std::endl; + + if(train) + { + for(int ss = 0; ss < _n_samp_train; ++ss) + { + out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_train[ss] << std::setw(2) << ", " << std::setw(22) << _prop_train_est[ss]; + for(int ff = 0; ff < _n_dim; ++ff) + out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->value()[ss]; + out_file_stream << std::endl; + } + } + else + { 
+ for(int ss = 0; ss < _n_samp_test; ++ss) + { + out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_test[ss] << std::setw(2) << ", " << std::setw(22) << _prop_test_est[ss]; + for(int ff = 0; ff < _n_dim; ++ff) + out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->test_value()[ss]; + out_file_stream << std::endl; + } + } + out_file_stream.close(); +} + +std::vector<double> ModelRegressor::sorted_error() +{ + std::vector<double> sorted_error(_train_error.size(), 0.0); + std::copy_n(_train_error.data(), _train_error.size(), sorted_error.data()); + std::transform(sorted_error.begin(), sorted_error.end(), sorted_error.begin(), [](double e){return std::abs(e);}); + std::sort(sorted_error.begin(), sorted_error.end()); + return sorted_error; +} + +std::vector<double> ModelRegressor::sorted_test_error() +{ + std::vector<double> sorted_error(_test_error.size(), 0.0); + std::copy_n(_test_error.data(), _test_error.size(), sorted_error.data()); + std::transform(sorted_error.begin(), sorted_error.end(), sorted_error.begin(), [](double e){return std::abs(e);}); + std::sort(sorted_error.begin(), sorted_error.end()); + return sorted_error; +} + +double ModelRegressor::mape() +{ + std::vector<double> percent_error(_train_error.size(), 0.0); + std::transform(_train_error.begin(), _train_error.end(), _prop_train.begin(), percent_error.begin(), [](double e, double p){return std::abs(e / p);}); + return util_funcs::mean(percent_error); +} + +double ModelRegressor::test_mape() +{ + std::vector<double> percent_error(_test_error.size(), 0.0); + std::transform(_test_error.begin(), _test_error.end(), _prop_test.begin(), percent_error.begin(), [](double e, double p){return std::abs(e / p);}); + return util_funcs::mean(percent_error); +} diff --git a/src/descriptor_identifier/Model/ModelRegressor.hpp b/src/descriptor_identifier/Model/ModelRegressor.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..4a28d58ff0cbf51009109b3ba0cdd33023231945 --- /dev/null +++ b/src/descriptor_identifier/Model/ModelRegressor.hpp @@ -0,0 +1,305 @@ +/** @file descriptor_identifier/ModelRegressor/ModelRegressor.hpp + * @brief Object to store the models generated form SISSO + * + * Creates a ModelRegressor generated from SISSO and the corresponding output file. + * It also has functionality to read in an output file to regenerate the model. + * + * @author Thomas A. R. Purcell (tpurcell) + * @bug No known bugs. + */ +#ifndef MODEL_REGRESSOR +#define MODEL_REGRESSOR + +#include <boost/algorithm/string.hpp> +#include <boost/algorithm/string/trim.hpp> +#include <boost/filesystem.hpp> + +#include<iomanip> +#include<fstream> +#include<iostream> + +#include <descriptor_identifier/Model/Model.hpp> +#include <feature_creation/node/ModelNode.hpp> +#include <utils/string_utils.hpp> + +#ifdef PY_BINDINGS + namespace np = boost::python::numpy; + namespace py = boost::python; +#endif + +typedef std::shared_ptr<ModelNode> model_node_ptr; + +// DocString: cls_model_reg +/** + * @brief Class to store the models found from SISSO + * + */ +class ModelRegressor : public Model +{ + std::vector<double> _prop_train_est; //!< The estimated Property (training data) + std::vector<double> _prop_test_est; //!< The estimated Property (testing data) + + std::vector<double> _train_error; //!< The error of the model (training) + std::vector<double> _test_error; //!< The error of the model (testing) + +public: + /** + * @brief Constructor for the model + * + * @param prop_unit The unit of the property + * @param prop_train The property vector for the training samples + * @param prop_test The property vector for the test samples + * @param feats The features for the model + * @param task_sizes_train Number of samples per task in the training data + * @param task_sizes_test Number of samples per task in the test data + */ + ModelRegressor(Unit prop_unit, std::vector<double> 
prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept); + + // DocString: model_reg_init_str + /** + * @brief Construct a model from a training output file + * @details Reads in all of the data from the output file and recreates the model object + * + * @param train_file Previously generated model file + */ + ModelRegressor(std::string train_file); + + // DocString: model_reg_init_str_str + /** + * @brief Construct a model from a training and testing output file + * @details Reads in all of the data from the output files and recreates the model object + * + * @param train_file Previously generated training model output file + * @param train_file Previously generated testing model output file + */ + ModelRegressor(std::string train_file, std::string test_file); + + /** + * @brief Copy Constructor + * + * @param o ModelRegressor to be copied + */ + ModelRegressor(const ModelRegressor& o) = default; + + /** + * @brief Move Constructor + * + * @param o ModelRegressor to be moved + */ + ModelRegressor(ModelRegressor&& o) = default; + + /** + * @brief Copy Assignment operator + * + * @param o ModelRegressor to be copied + */ + ModelRegressor& operator= (const ModelRegressor& o) = default; + + /** + * @brief Move Assignment operator + * + * @param o ModelRegressor to be moved + */ + ModelRegressor& operator= (ModelRegressor&& o) = default; + + /** + * @brief Read an output file and extract all relevant information + * @details Takes in an output file and extracts all data needed to recreate the model + * + * @param filename Name of the output file + * @param train If true then the output represents training data + * + * @return [description] + */ + std::vector<std::string> populate_model(std::string filename, bool train); + + // DocString: model_reg_str + /** + * @brief Convert the model to a string + + * @return The string representation of the model + */ + 
std::string toString() const; + + /** + * @brief Copy the error into a new array + * + * @param res pointer to the beginning of the vector to store the residual + */ + inline void copy_error(double* res){std::copy_n(_train_error.data(), _n_samp_train, res);} + + // DocString: model_reg_rmse + /** + * @brief The training rmse of the model + */ + inline double rmse(){return util_funcs::norm(_train_error.data(), _n_samp_train) / std::sqrt(static_cast<double>(_n_samp_train));} + + // DocString: model_reg_test_rmse + /** + * @brief The test rmse of the model + */ + inline double test_rmse(){return util_funcs::norm(_test_error.data(), _n_samp_test) / std::sqrt(static_cast<double>(_n_samp_test));} + + // DocString: model_reg_max_ae + /** + * @brief The max Absolute error of the training data + */ + inline double max_ae() + { + return std::abs(*std::max_element(_train_error.data(), _train_error.data() + _n_samp_train, [](double d1, double d2){return std::abs(d1) < std::abs(d2);})); + } + + // DocString: model_reg_test_max_ae + /** + * @brief The max Absolute error of the testing data + */ + inline double test_max_ae() + { + return std::abs(*std::max_element(_test_error.data(), _test_error.data() + _n_samp_test, [](double d1, double d2){return std::abs(d1) < std::abs(d2);})); + } + + // DocString: model_reg_mae + /** + * @brief The mean absolute error of the model + * @return The mean absolute error of the training data + */ + inline double mae(){return std::accumulate(_train_error.begin(), _train_error.end(), 0.0, [](double total, double e){return total + std::abs(e);}) / _n_samp_train;} + + // DocString: model_reg_test_mae + /** + * @brief The mean absolute test error of the model + * @return The mean absolute error of the test data + */ + inline double test_mae(){return std::accumulate(_test_error.begin(), _test_error.end(), 0.0, [](double total, double e){return total + std::abs(e);}) / _n_samp_test;} + + // DocString: model_reg_mape + /** + * @brief The mean absolute 
error of the model + * @return The mean absolute error of the training data + */ + double mape(); + + // DocString: model_reg_test_mape + /** + * @brief The mean absolute test error of the model + * @return The mean absolute error of the test data + */ + double test_mape(); + + /** + * @brief Sort the training error based on magnitude + * @return The error vector sorted + */ + std::vector<double> sorted_error(); + + /** + * @brief Sort the training test_error based on magnitude + * @return The test_error vector sorted + */ + std::vector<double> sorted_test_error(); + + // DocString: model_reg_percentile_25_ae + /** + * @brief The mean absolute error of the model + * @return The mean absolute error of the training data + */ + inline double percentile_25_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.25))];} + + // DocString: model_reg_test_percentile_25_ae + /** + * @brief The mean absolute test error of the model + * @return The mean absolute error of the test data + */ + inline double percentile_25_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.25))];} + + // DocString: model_reg_percentile_50_ae + /** + * @brief The mean absolute error of the model + * @return The mean absolute error of the training data + */ + inline double percentile_50_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.50))];} + + // DocString: model_reg_test_percentile_50_ae + /** + * @brief The mean absolute test error of the model + * @return The mean absolute error of the test data + */ + inline double percentile_50_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.50))];} + + // DocString: model_reg_percentile_75_ae + /** + * @brief The mean absolute error of the model + * @return The mean absolute error of the training data + */ + inline double percentile_75_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.75))];} + + // DocString: model_reg_test_percentile_75_ae + /** + * 
@brief The mean absolute test error of the model + * @return The mean absolute error of the test data + */ + inline double percentile_75_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.75))];} + + // DocString: model_reg_percentile_95_ae + /** + * @brief The mean absolute error of the model + * @return The mean absolute error of the training data + */ + inline double percentile_95_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.95))];} + + // DocString: model_reg_test_percentile_95_ae + /** + * @brief The mean absolute test error of the model + * @return The mean absolute error of the test data + */ + inline double percentile_95_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.95))];} + + /** + * @brief Convert the ModelRegressor into an output file + * + * @param filename The name of the file to output to + * @param train If true output the training data + * @param test_inds The indexes of the test set + */ + void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {}); + + #ifdef PY_BINDINGS + // DocString: model_reg_prop_train_est + /** + * @brief The estimation of the property + * @return _prop_train_est as a numpy ndarray + */ + inline np::ndarray prop_train_est(){return python_conv_utils::to_ndarray<double>(_prop_train_est);} + + // DocString: model_reg_prop_test_est + /** + * @brief The estimation of the properties test values + * @return _prop_test_est as a numpy ndarray + */ + inline np::ndarray prop_test_est(){return python_conv_utils::to_ndarray<double>(_prop_test_est);} + + // DocString: model_reg_train_error + /** + * @brief The training error + * @return _train_error as a numpy ndarray + */ + inline np::ndarray train_error(){return python_conv_utils::to_ndarray<double>(_train_error);} + + // DocString: model_reg_test_error + /** + * @brief The test error + * @return _test_error as a numpy ndarray + */ + inline np::ndarray test_error(){return 
python_conv_utils::to_ndarray<double>(_test_error);} + #endif +}; + +/** + * @brief Print a model to an string stream + * + * @param outStream The output stream the model is to be printed + * @param model The model to be printed + */ +std::ostream& operator<< (std::ostream& outStream, const ModelRegressor& model); + +#endif \ No newline at end of file diff --git a/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d24d8d5f1e69c3f7afb2de99ac6618db61d47d97 --- /dev/null +++ b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp @@ -0,0 +1,252 @@ +#include <descriptor_identifier/SISSO_DI/SISSOClassifier.hpp> + +SISSOClassifier::SISSOClassifier( + std::shared_ptr<FeatureSpace> feat_space, + Unit prop_unit, + std::vector<double> prop, + std::vector<double> prop_test, + std::vector<int> task_sizes_train, + std::vector<int> task_sizes_test, + std::vector<int> leave_out_inds, + int n_dim, + int n_residual, + int n_models_store, + bool fix_intercept +): + SISSO_DI(feat_space, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, fix_intercept), + _training(task_sizes_test.size()), + _int_prop(prop.size()), + _int_prop_test(prop_test.size()), + _int_error(prop.size()), + _trainer(1, !fix_intercept), + _c(1.0) +{ + _trainer.stoppingCondition().maxSeconds = 0.01; + + double min_prop = *std::min_element(prop.begin(), prop.end()); + if(min_prop < 0.0) + { + std::transform(prop.begin(), prop.end(), prop.begin(), [&min_prop](double pp){return pp - min_prop;}); + std::transform(prop_test.begin(), prop_test.end(), prop_test.begin(), [&min_prop](double pp){return pp - min_prop;}); + } + std::vector<bool> not_int(prop.size()); + std::transform(prop.begin(), prop.end(), not_int.begin(), [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;}); + if(std::any_of(not_int.begin(), not_int.end(), 
[](bool b){return b;})) + throw std::logic_error("For classification the property (training set) must be an integer"); + + not_int.resize(prop_test.size()); + std::transform(prop_test.begin(), prop_test.end(), not_int.begin(), [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;}); + if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;})) + throw std::logic_error("For classification the property (test set) must be an integer"); + + for(auto& pt : prop_test) + if(std::none_of(prop.begin(), prop.end(), [&pt](double pp){return std::abs(pp - pt) < 1e-10;})) + throw std::logic_error("A class in the property vector (test set) is not in the training set."); + + std::transform(_prop.begin(), _prop.end(), _prop.begin(), [&min_prop](double pp){return pp - min_prop;}); + std::transform(_prop_test.begin(), _prop_test.end(), _prop_test.begin(), [&min_prop](double pp){return pp - min_prop;}); + + for(int pp = 0; pp < _prop.size(); ++pp) + _int_prop[pp] = static_cast<unsigned int>(std::round(_prop[pp])); + + for(int pp = 0; pp < _prop_test.size(); ++pp) + _int_prop_test[pp] = static_cast<unsigned int>(std::round(_prop_test[pp])); + +} + +void SISSOClassifier::setup_training_data(int n_dim) +{ + std::vector<unsigned int> prop_to_copy; + int n_els = 0; + for(int tt = 0; tt < _task_sizes_train.size(); ++tt) + { + shark::Data<shark::RealVector> data(_task_sizes_train[tt], shark::RealVector(n_dim)); + shark::Data<unsigned int> labels(_task_sizes_train[tt], 0); + + for(int ii = 0; ii < _task_sizes_train[tt]; ++ii) + prop_to_copy[ii] = static_cast<unsigned int>(_prop[ii + n_els]); + + int n_copied = 0; + for(auto& batch : labels.batches()) + { + std::copy_n(prop_to_copy.begin() + n_copied, shark::batchSize(batch), &batch(0)); + n_els += shark::batchSize(batch); + n_copied += shark::batchSize(batch); + } + _training[tt] = shark::LabeledData<shark::RealVector, unsigned int>(data, labels); + } +} + +void SISSOClassifier::set_data(std::vector<int>& inds, int start, 
int task) +{ + int n_els = 0; + for(auto& batch: _training[task].inputs().batches()) + { + for(int ii = 0; ii < inds.size(); ++ii) + dcopy_(shark::batchSize(batch), node_value_arrs::get_d_matrix_ptr(inds[ii]) + start + n_els, 1, &batch(0, 0) + ii, inds.size()); + n_els += shark::batchSize(batch); + } +} + +void SISSOClassifier::svm(std::vector<int>& inds, double* coefs, int start, int task) +{ + set_data(inds, start, task); + _trainer.train(_model, _training[task]); + std::copy_n(_model.parameterVector().begin(), inds.size(), coefs); + std::copy_n(_model.bias().begin(), _model.bias().size(), coefs); +} + +void SISSOClassifier::l0_norm(std::vector<double>& prop, int n_dim) +{ + int n_get_models = std::max(_n_residual, _n_models_store); + std::vector<double> coefs(n_dim + 1, 0.0); + + std::vector<int> inds(n_dim, 0); + std::vector<int> inds_min(n_get_models * _n_dim, -1); + + std::vector<double> min_errors(n_get_models, util_funcs::norm(_prop.data(), _n_samp)); + std::vector<double> min_dist_errors(n_get_models, util_funcs::norm(_prop.data(), _n_samp)); + + for(int rr = 0; rr < n_dim; ++rr) + inds[rr] = _feat_space->phi_selected().size() - 1 - rr; + util_funcs::iterate(inds, inds.size(), _mpi_comm->rank()); + int max_error_ind = 0; + + std::vector<double> gamma(_prop.size(), 0.0); + std::vector<double> gamma_feat(_prop.size(), 0.0); + do { + int start = 0; + double error = 0.0; + double dist_error = 0.0; + // for(auto& sz : _task_sizes_train) + for(int tt = 0; tt < _task_sizes_train.size(); ++ tt) + { + svm(inds, coefs.data(), start, tt); + double norm = util_funcs::norm(coefs.data(), n_dim); + std::fill_n(gamma.begin(), _task_sizes_train[tt], (_fix_intercept ? 
0.0 : coefs[n_dim]) / norm); + int n_els = 0; + for(auto& batch: _training[tt].inputs().batches()) + { + for(int ii = 0; ii < inds.size(); ++ii) + daxpy_(_task_sizes_train[tt], coefs[ii] / norm, &batch(0, 0) + ii, n_dim, gamma_feat.data() + n_els, 1); + n_els += shark::batchSize(batch); + } + std::transform(gamma_feat.begin(), gamma_feat.begin() + _task_sizes_train[tt], gamma.begin(), gamma.begin(), [](double d1, double d2){return std::abs(d1) + d2;}); + + error += _loss.eval(_training[tt].labels(), _model(_training[tt].inputs())) * _task_sizes_train[tt]; + dist_error += *std::min_element(gamma.begin(), gamma.end()); + start += _task_sizes_train[tt]; + } + dist_error /= static_cast<double>(_task_sizes_train.size()); + + if((error < min_errors[max_error_ind]) || ((error == min_errors[max_error_ind]) && (dist_error > min_dist_errors[max_error_ind]))) + { + min_errors[max_error_ind] = error; + min_dist_errors[max_error_ind] = dist_error; + std::copy_n(inds.begin(), inds.size(), inds_min.begin() + max_error_ind * n_dim); + + double max_dist = *std::max_element(min_dist_errors.begin(), min_dist_errors.end()) + 0.01; + daxpy_(n_get_models, -1.0 / max_dist, min_dist_errors.data(), 1, min_errors.data(), 1); + max_error_ind = std::max_element(min_errors.begin(), min_errors.end()) - min_errors.begin(); + daxpy_(n_get_models, 1.0 / max_dist, min_dist_errors.data(), 1, min_errors.data(), 1); + } + } while(util_funcs::iterate(inds, inds.size(), _mpi_comm->size())); + + std::vector<double> all_min_error(_mpi_comm->size() * n_get_models); + std::vector<double> all_min_dist_error(_mpi_comm->size() * n_get_models); + std::vector<int> all_inds_min(_mpi_comm->size() * n_get_models * n_dim); + + mpi::all_gather(*_mpi_comm, min_errors.data(), n_get_models, all_min_error); + mpi::all_gather(*_mpi_comm, min_dist_errors.data(), n_get_models, all_min_dist_error); + mpi::all_gather(*_mpi_comm, inds_min.data(), n_get_models * n_dim, all_inds_min); + + inds = 
util_funcs::argsort(all_min_error); + + std::vector<model_node_ptr> min_nodes(n_dim); + std::vector<ModelClassifier> models; + + for(int rr = 0; rr < n_get_models; ++rr) + { + node_value_arrs::clear_temp_test_reg(); + for(int ii = 0; ii < n_dim; ++ii) + { + int index = all_inds_min[inds[rr] * n_dim + ii]; + min_nodes[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]->arr_ind(), _feat_space->phi_selected()[index]->rung(), _feat_space->phi_selected()[index]->expr(), _feat_space->phi_selected()[index]->postfix_expr(), _feat_space->phi_selected()[index]->value(), _feat_space->phi_selected()[index]->test_value(), _feat_space->phi_selected()[index]->unit()); + } + models.push_back(ModelClassifier(_prop_unit, _prop, _prop_test, min_nodes, _task_sizes_train, _task_sizes_test, _fix_intercept)); + } + + _models.push_back(models); +} + + +void SISSOClassifier::fit() +{ + std::vector<double> residual(_n_samp * _n_residual); + std::clock_t start; + double duration; + start = std::clock(); + + _feat_space->sis(_prop); + + _mpi_comm->barrier(); + duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; + if(_mpi_comm->rank() == 0) + std::cout << "Time for SIS: " << duration << std::endl; + + start = std::clock(); + + std::vector<ModelClassifier> models; + for(int rr = 0; rr < std::max(_n_residual, _n_models_store); ++rr) + { + node_value_arrs::clear_temp_test_reg(); + model_node_ptr model_feat = std::make_shared<ModelNode>(_feat_space->phi_selected()[rr]->arr_ind(), _feat_space->phi_selected()[rr]->rung(), _feat_space->phi_selected()[rr]->expr(), _feat_space->phi_selected()[rr]->postfix_expr(), _feat_space->phi_selected()[rr]->value(), _feat_space->phi_selected()[rr]->test_value(), _feat_space->phi_selected()[rr]->unit()); + models.push_back(ModelClassifier(_prop_unit, _prop, _prop_test, {model_feat}, _task_sizes_train, _task_sizes_test, _fix_intercept)); + if(rr < _n_residual) + models.back().copy_error(&residual[rr * _n_samp]); + + if((_mpi_comm->rank() 
== 0) && (rr < _n_models_store)) + { + models.back().to_file("models/train_dim_1_model_" + std::to_string(rr) + ".dat"); + if(_leave_out_inds.size() > 0) + models.back().to_file("models/test_dim_1_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds); + } + } + _models.push_back(models); + + _mpi_comm->barrier(); + duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; + if(_mpi_comm->rank() == 0) + std::cout << "Time for Model: " << duration << std::endl; + + for(int dd = 2; dd <= _n_dim; ++dd) + { + start = std::clock(); + _feat_space->sis(residual); + + _mpi_comm->barrier(); + duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; + if(_mpi_comm->rank() == 0) + std::cout << "Time for SIS: " << duration << std::endl; + + start = std::clock(); + l0_norm(_prop, dd); + + _mpi_comm->barrier(); + duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; + if(_mpi_comm->rank() == 0) + { + std::cout << "Time for l0-norm: " << duration << std::endl; + + for(int rr = 0; rr < _n_models_store; ++rr) + { + _models.back()[rr].to_file("models/train_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat"); + if(_leave_out_inds.size() > 0) + _models.back()[rr].to_file("models/test_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds); + } + } + for(int rr = 0; rr < _n_residual; ++rr) + _models.back()[rr].copy_error(&residual[rr * _n_samp]); + } +} + diff --git a/src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp new file mode 100644 index 0000000000000000000000000000000000000000..6e46c76ccb3d01488e216d58be2daf27d179d78e --- /dev/null +++ b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp @@ -0,0 +1,183 @@ +/** @file descriptor_identifier/SISSOClassifier.hpp + * @brief Perform SISSO on a previously generated Feature Space + * + * Takes in a feature space and performs SISSO on it while storing the selected features in _models + * + * 
@author Thomas A. R. Purcell (tpurcell) + * @bug No known bugs. + */ +#ifndef SISSO_CLASSIFIER +#define SISSO_CLASSIFIER + +#include <shark/Data/Dataset.h> +#include <shark/ObjectiveFunctions/Loss/ZeroOneLoss.h> +#include <shark/Algorithms/Trainers/CSvmTrainer.h> + +#include <feature_creation/feature_space/FeatureSpace.hpp> +#include <descriptor_identifier/Model/ModelClassifier.hpp> +#include <descriptor_identifier/SISSO_DI/SISSO_DI.hpp> +#include <ctime> + +#ifdef PY_BINDINGS + namespace np = boost::python::numpy; + namespace py = boost::python; +#endif + +// DocString: cls_sisso_class +/** + * @brief SISSO Classifier class, performs the SISSO algorithm and stores all selected models + * + */ +class SISSOClassifier: public SISSO_DI +{ +protected: + std::vector<std::vector<ModelClassifier>> _models; //!< List of models + + std::vector<shark::LabeledData<shark::RealVector, unsigned int>> _training; //!< Training data set for all tasks + + std::vector<unsigned int> _int_prop; //!< Property array (training data) + std::vector<unsigned int> _int_prop_test; //!< Property array (testing data) + std::vector<int> _int_error; //!< Array to calculate the residuals for the models (training data) + shark::ZeroOneLoss<unsigned int> _loss; //!< The loss calculator + + shark::LinearCSvmTrainer<shark::RealVector> _trainer; //!< The trainer for the linear C-SVM + shark::LinearClassifier<shark::RealVector> _model; //!< Model for the SVM classification problem + + double _c; //!< The SVM regularization (cost) parameter + +public: + /** + * @brief Constructor for the Classifier + * + * @param feat_space The feature space to run SISSO on + * @param prop_unit The unit of the property + * @param prop Vector storing all data to train the SISSO models with + * @param prop_test Vector storing all data to test the SISSO models with + * @param task_sizes_train Number of training samples per task + * @param task_sizes_test Number of testing samples per task + * @param leave_out_inds List of indexes from the initial data file
in the test set + * @param n_dim Maximum dimensionality of the generated models + * @param n_residual Number of residuals to pass to the next SIS operation + * @param n_models_store Number of features to store in files + * @param fix_intercept If true fix intercept to 0 + */ + SISSOClassifier(std::shared_ptr<FeatureSpace> feat_space, Unit prop_unit, std::vector<double> prop, std::vector<double> prop_test, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, std::vector<int> leave_out_inds, int n_dim, int n_residual, int n_models_store, bool fix_intercept=false); + + /** + * @brief Set up the Shark training data structures for each task + * + * @param n_dim Number of features (columns) in the training data + */ + void setup_training_data(int n_dim); + + /** + * @brief Perform the SVM optimization + * + * @param inds Feature indexes to get the model of + * @param coeffs Coefficients for the model + * @param start The index in the property and feature vectors where this task's data starts + * @param n_samp Index of the task to train on (named task in the definition) + */ + void svm(std::vector<int>& inds, double* coeffs, int start, int n_samp); + + /** + * @brief Set the data object for the given set of inds + * + * @param inds indexes of the selected features + * @param start The index in the property and feature vectors where this task's data starts + * @param task Index of the task whose training data is filled + */ + void set_data(std::vector<int>& inds, int start, int task); + + /** + * @brief Perform the l0 normalization for a property or the residual + * + * @param prop The property to fit + * @param n_dim the dimensionality of the model + */ + void l0_norm(std::vector<double>& prop, int n_dim); + + // DocString: sisso_class_fit + /** + * @brief Perform SISSO to generate the models + * @details Iteratively perform SISSO on the Feature space and property until the model dimensionality is equal to _n_dim + */ + 
void fit(); + + /** + * @brief Accessor function for models + */ + inline std::vector<std::vector<ModelClassifier>> models(){return _models;} + + // Python interface functions + #ifdef PY_BINDINGS + /** + * @brief Constructor for the Classifier that takes in python objects (cpp definition in <python/descriptor_identifier/SISSOClassifier.cpp) + * + * @param feat_space The feature space to run SISSO on + * @param prop_unit The unit of the property + * @param prop Vector storing all data to train the SISSO models with + * @param prop_test Vector storing all data to test the SISSO models with + * @param task_sizes_train Number of training samples per task + * @param task_sizes_test Number of testing samples per task + * @param leave_out_inds List of indexes from the initial data file in the test set + * @param n_dim Maximum dimensionality of the generated models + * @param n_residual Number of residuals to pass to the next SIS operation + * @param n_models_store Number of features to store in files + * @param fix_intercept If true fix intercept to 0 + */ + SISSOClassifier( + std::shared_ptr<FeatureSpace> feat_space, + Unit prop_unit, + np::ndarray prop, + np::ndarray prop_test, + py::list task_sizes_train, + py::list task_sizes_test, + py::list leave_out_inds, + int n_dim, + int n_residual, + int n_models_store, + bool fix_intercept=false + ); + + // DocString: sisso_class_init_list + /** + * @brief Constructor for the Classifier that takes in python objects (cpp definition in <python/descriptor_identifier/SISSOClassifier.cpp) + * + * @param feat_space The feature space to run SISSO on + * @param prop_unit The unit of the property + * @param prop Vector storing all data to train the SISSO models with + * @param prop_test Vector storing all data to test the SISSO models with + * @param task_sizes_train Number of training samples per task + * @param task_sizes_test Number of testing samples per task + * @param leave_out_inds List of indexes from the initial data file in the 
test set + * @param n_dim Maximum dimensionality of the generated models + * @param n_residual Number of residuals to pass to the next SIS operation + * @param n_models_store Number of features to store in files + * @param fix_intrecept If true fix intercept to 0 + */ + SISSOClassifier( + std::shared_ptr<FeatureSpace> feat_space, + Unit prop_unit, + py::list prop, + py::list prop_test, + py::list task_sizes_train, + py::list task_sizes_test, + py::list leave_out_inds, + int n_dim, + int n_residual, + int n_models_store, + bool fix_intercept=false + ); + + // DocString: sisso_class_models_py + /** + * @brief The selected models (cpp definition in <python/descriptor_identifier/SISSO_DI.cpp) + * @return models as a python list + */ + py::list models_py(); + #endif +}; + +#endif diff --git a/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp b/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp index e7009a783dba97b6d2b06b201c933ec87c3830f9..828129ebcdd26c9ff49921c3005a079d48979b3c 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp +++ b/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp @@ -125,7 +125,7 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) inds = util_funcs::argsort(all_min_error); std::vector<model_node_ptr> min_nodes(n_dim); - std::vector<Model> models; + std::vector<ModelRegressor> models; for(int rr = 0; rr < n_get_models; ++rr) { @@ -135,8 +135,79 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) int index = all_inds_min[inds[rr] * n_dim + ii]; min_nodes[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]->arr_ind(), _feat_space->phi_selected()[index]->rung(), _feat_space->phi_selected()[index]->expr(), _feat_space->phi_selected()[index]->postfix_expr(), _feat_space->phi_selected()[index]->value(), _feat_space->phi_selected()[index]->test_value(), _feat_space->phi_selected()[index]->unit()); } - models.push_back(Model(_prop_unit, _prop, _prop_test, min_nodes, 
_task_sizes_train, _task_sizes_test, _fix_intercept)); + models.push_back(ModelRegressor(_prop_unit, _prop, _prop_test, min_nodes, _task_sizes_train, _task_sizes_test, _fix_intercept)); } _models.push_back(models); } + + +void SISSORegressor::fit() +{ + std::vector<double> residual(_n_samp * _n_residual); + std::clock_t start; + double duration; + start = std::clock(); + + _feat_space->sis(_prop); + + _mpi_comm->barrier(); + duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; + if(_mpi_comm->rank() == 0) + std::cout << "Time for SIS: " << duration << std::endl; + + start = std::clock(); + + std::vector<ModelRegressor> models; + for(int rr = 0; rr < std::max(_n_residual, _n_models_store); ++rr) + { + node_value_arrs::clear_temp_test_reg(); + model_node_ptr model_feat = std::make_shared<ModelNode>(_feat_space->phi_selected()[rr]->arr_ind(), _feat_space->phi_selected()[rr]->rung(), _feat_space->phi_selected()[rr]->expr(), _feat_space->phi_selected()[rr]->postfix_expr(), _feat_space->phi_selected()[rr]->value(), _feat_space->phi_selected()[rr]->test_value(), _feat_space->phi_selected()[rr]->unit()); + models.push_back(ModelRegressor(_prop_unit, _prop, _prop_test, {model_feat}, _task_sizes_train, _task_sizes_test, _fix_intercept)); + if(rr < _n_residual) + models.back().copy_error(&residual[rr * _n_samp]); + + if((_mpi_comm->rank() == 0) && (rr < _n_models_store)) + { + models.back().to_file("models/train_dim_1_model_" + std::to_string(rr) + ".dat"); + if(_leave_out_inds.size() > 0) + models.back().to_file("models/test_dim_1_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds); + } + } + _models.push_back(models); + + _mpi_comm->barrier(); + duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; + if(_mpi_comm->rank() == 0) + std::cout << "Time for Model: " << duration << std::endl; + + for(int dd = 2; dd <= _n_dim; ++dd) + { + start = std::clock(); + _feat_space->sis(residual); + + _mpi_comm->barrier(); + duration = ( std::clock() - start ) 
/ (double) CLOCKS_PER_SEC; + if(_mpi_comm->rank() == 0) + std::cout << "Time for SIS: " << duration << std::endl; + + start = std::clock(); + l0_norm(_prop, dd); + + _mpi_comm->barrier(); + duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; + if(_mpi_comm->rank() == 0) + { + std::cout << "Time for l0-norm: " << duration << std::endl; + + for(int rr = 0; rr < _n_models_store; ++rr) + { + _models.back()[rr].to_file("models/train_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat"); + if(_leave_out_inds.size() > 0) + _models.back()[rr].to_file("models/test_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds); + } + } + for(int rr = 0; rr < _n_residual; ++rr) + _models.back()[rr].copy_error(&residual[rr * _n_samp]); + } +} diff --git a/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp b/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp index ee62cb5fe7ac65d395774a1a98188b9a6bcea7ee..939994dfb8e3cbb9163268b4986974086b57f506 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp +++ b/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp @@ -10,7 +10,7 @@ #define SISSO_REGRESSOR #include <feature_creation/feature_space/FeatureSpace.hpp> -#include <descriptor_identifier/Model/Model.hpp> +#include <descriptor_identifier/Model/ModelRegressor.hpp> #include <descriptor_identifier/SISSO_DI/SISSO_DI.hpp> #include <ctime> @@ -27,6 +27,8 @@ class SISSORegressor: public SISSO_DI { protected: + std::vector<std::vector<ModelRegressor>> _models; //!< List of models + std::vector<double> _a; //!< A matrix for least squares std::vector<double> _b; //!< Solution array for least squares std::vector<double> _work; //!< The work array for least squares problems @@ -94,6 +96,18 @@ public: */ void l0_norm(std::vector<double>& prop, int n_dim); + // DocString: sisso_reg_fit + /** + * @brief Perform SISSO to generate the models + * @details Iteratively pefrom SISSO on the Feature space and property until 
the model dimensionality is equal to _n_dim + */ + void fit(); + + /** + * @brief Acessor function for models + */ + inline std::vector<std::vector<ModelRegressor>> models(){return _models;} + // Python interface functions #ifdef PY_BINDINGS /** @@ -154,6 +168,13 @@ public: int n_models_store, bool fix_intercept=false ); + + // DocString: sisso_reg_models_py + /** + * @brief The selected models (cpp definition in <python/descriptor_identifier/SISSO_DI.cpp) + * @return models as a python list + */ + py::list models_py(); #endif }; diff --git a/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp b/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp index 9e9dabe784b2ee230536894299863c8d9e87612c..3556902b0971bf84b07c97c8769ea5bc95c05116 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp +++ b/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp @@ -28,74 +28,3 @@ SISSO_DI::SISSO_DI( _n_models_store(n_models_store), _fix_intercept(fix_intercept) {} - - -void SISSO_DI::fit() -{ - std::vector<double> residual(_n_samp * _n_residual); - std::clock_t start; - double duration; - start = std::clock(); - - _feat_space->sis(_prop); - - _mpi_comm->barrier(); - duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - if(_mpi_comm->rank() == 0) - std::cout << "Time for SIS: " << duration << std::endl; - - start = std::clock(); - - std::vector<Model> models; - for(int rr = 0; rr < std::max(_n_residual, _n_models_store); ++rr) - { - node_value_arrs::clear_temp_test_reg(); - model_node_ptr model_feat = std::make_shared<ModelNode>(_feat_space->phi_selected()[rr]->arr_ind(), _feat_space->phi_selected()[rr]->rung(), _feat_space->phi_selected()[rr]->expr(), _feat_space->phi_selected()[rr]->postfix_expr(), _feat_space->phi_selected()[rr]->value(), _feat_space->phi_selected()[rr]->test_value(), _feat_space->phi_selected()[rr]->unit()); - models.push_back(Model(_prop_unit, _prop, _prop_test, {model_feat}, _task_sizes_train, _task_sizes_test, _fix_intercept)); - if(rr < _n_residual) - 
models.back().copy_error(&residual[rr * _n_samp]); - - if((_mpi_comm->rank() == 0) && (rr < _n_models_store)) - { - models.back().to_file("models/train_dim_1_model_" + std::to_string(rr) + ".dat"); - if(_leave_out_inds.size() > 0) - models.back().to_file("models/test_dim_1_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds); - } - } - _models.push_back(models); - - _mpi_comm->barrier(); - duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - if(_mpi_comm->rank() == 0) - std::cout << "Time for Model: " << duration << std::endl; - - for(int dd = 2; dd <= _n_dim; ++dd) - { - start = std::clock(); - _feat_space->sis(residual); - - _mpi_comm->barrier(); - duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - if(_mpi_comm->rank() == 0) - std::cout << "Time for SIS: " << duration << std::endl; - - start = std::clock(); - l0_norm(_prop, dd); - - _mpi_comm->barrier(); - duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; - if(_mpi_comm->rank() == 0) - { - std::cout << "Time for l0-norm: " << duration << std::endl; - - for(int rr = 0; rr < _n_models_store; ++rr) - { - _models.back()[rr].to_file("models/train_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat"); - if(_leave_out_inds.size() > 0) - _models.back()[rr].to_file("models/test_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds); - } - } - for(int rr = 0; rr < _n_residual; ++rr) - _models.back()[rr].copy_error(&residual[rr * _n_samp]); - } -} diff --git a/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp b/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp index ddf46e7614db3042922f842ed3842b9b10428941..4407575408fcadab1b071abb0ab610bdc0db3bdc 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp +++ b/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp @@ -26,8 +26,6 @@ class SISSO_DI { protected: - std::vector<std::vector<Model>> _models; //!< List of models - std::vector<double> _prop; //!< Property array (training 
data) std::vector<double> _prop_test; //!< Property array (testing data) std::vector<double> _error; //!< Array to calculate the residuals for the models (training data) @@ -65,21 +63,11 @@ public: */ SISSO_DI(std::shared_ptr<FeatureSpace> feat_space, Unit prop_unit, std::vector<double> prop, std::vector<double> prop_test, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, std::vector<int> leave_out_inds, int n_dim, int n_residual, int n_models_store, bool fix_intercept=false); - /** - * @brief Set the residual for the next step - * - * @param inds indexes of the selected features - * @param coeffs Coefficients of the model - * @param start The index in the property and feature vectors start copying into _b and _a - * @param n_samp number of samples to perform least squares optimization on - */ - virtual void set_error(std::vector<int>& inds, double* coeffs, int start, int n_samp) = 0; - /** * @brief Perform SISSO to generate the models * @details Iteratively pefrom SISSO on the Feature space and property until the model dimensionality is equal to _n_dim */ - void fit(); + virtual void fit() = 0; /** * @brief Preform the l0 normalization for a property or the residual @@ -109,11 +97,6 @@ public: */ inline std::vector<double> error(){return _error;} - /** - * @brief Acessor function for models - */ - inline std::vector<std::vector<Model>> models(){return _models;} - // DocString: sisso_di_n_samp /** * @brief The number of samples per feature @@ -199,13 +182,6 @@ public: bool fix_intercept=false ); - // DocString: sisso_di_models_py - /** - * @brief The selected models (cpp definition in <python/descriptor_identifier/SISSO_DI.cpp) - * @return models as a python list - */ - py::list models_py(); - // DocString: sisso_di_prop_py /** * @brief The property to be learned diff --git a/src/python/bindings_docstring_keyed.cpp b/src/python/bindings_docstring_keyed.cpp index b9c06e412738d392eee94ab67b460c215f1ba360..f055ea4bff833d05a0730955cb31fb9a2bae196b 
100644 --- a/src/python/bindings_docstring_keyed.cpp +++ b/src/python/bindings_docstring_keyed.cpp @@ -5,8 +5,11 @@ using namespace boost::python; void sisso::register_all() { sisso::descriptor_identifier::registerModel(); + sisso::descriptor_identifier::registerModelRegressor(); + sisso::descriptor_identifier::registerModelClassifier(); sisso::descriptor_identifier::registerSISSO_DI(); sisso::descriptor_identifier::registerSISSORegressor(); + sisso::descriptor_identifier::registerSISSOClassifier(); sisso::feature_creation::registerFeatureSpace(); sisso::feature_creation::registerUnit(); sisso::feature_creation::node::registerNode(); @@ -341,49 +344,69 @@ void sisso::feature_creation::node::registerSixPowNode() void sisso::descriptor_identifier::registerModel() { - class_<Model>("Model", init<Unit, std::vector<double>, std::vector<double>, std::vector<model_node_ptr>, std::vector<int>, std::vector<int>, bool>()) - .def(init<std::string>()) - .def(init<std::string, std::string>()) - .def("__str__", &Model::toString, "@DocString_model_str@") - .def("__repr__", &Model::toString, "@DocString_model_str@") + class_<sisso::descriptor_identifier::Model_Wrap, boost::noncopyable>("Model", no_init) .add_property("n_samp_train", &Model::n_samp_train, "@DocString_model_n_samp_train@") .add_property("n_samp_test", &Model::n_samp_test, "@DocString_model_n_samp_test@") .add_property("n_dim", &Model::n_dim, "@DocString_model_n_dim@") - .add_property("fit", &Model::prop_train_est, "@DocString_model_prop_train_est@") - .add_property("predict", &Model::prop_test_est, "@DocString_model_prop_test_est@") .add_property("prop_train", &Model::prop_train, "@DocString_model_prop_train@") .add_property("prop_test", &Model::prop_test, "@DocString_model_prop_test@") - .add_property("train_error", &Model::train_error, "@DocString_model_train_error@") - .add_property("test_error", &Model::test_error, "@DocString_model_test_error@") .add_property("feats", &Model::feats, "@DocString_model_feats@") 
.add_property("coefs", &Model::coefs, "@DocString_model_coefs@") - .add_property("rmse", &Model::rmse, "@DocString_model_rmse@") - .add_property("test_rmse", &Model::test_rmse, "@DocString_model_test_rmse@") - .add_property("max_ae", &Model::max_ae, "@DocString_model_max_ae@") - .add_property("test_max_ae", &Model::test_max_ae, "@DocString_model_test_max_ae@") - .add_property("mae", &Model::mae, "@DocString_model_mae@") - .add_property("test_mae", &Model::test_mae, "@DocString_model_test_mae@") - .add_property("mape", &Model::mape, "@DocString_model_mape@") - .add_property("test_mape", &Model::test_mape, "@DocString_model_test_mape@") - .add_property("percentile_25_ae", &Model::percentile_25_ae, "@DocString_model_percentile_25_ae@") - .add_property("percentile_25_test_ae", &Model::percentile_25_test_ae, "@DocString_model_test_percentile_25_test_ae@") - .add_property("percentile_50_ae", &Model::percentile_50_ae, "@DocString_model_percentile_50_ae@") - .add_property("percentile_50_test_ae", &Model::percentile_50_test_ae, "@DocString_model_test_percentile_50_test_ae@") - .add_property("percentile_75_ae", &Model::percentile_75_ae, "@DocString_model_percentile_75_ae@") - .add_property("percentile_75_test_ae", &Model::percentile_75_test_ae, "@DocString_model_test_percentile_75_test_ae@") - .add_property("percentile_95_ae", &Model::percentile_95_ae, "@DocString_model_percentile_95_ae@") - .add_property("percentile_95_test_ae", &Model::percentile_95_ae, "@DocString_model_test_percentile_95_ae@") .add_property("prop_unit", &Model::prop_unit, "@DocString_model_prop_unit@") ; } +void sisso::descriptor_identifier::registerModelRegressor() +{ + class_<ModelRegressor, bases<Model>>("ModelRegressor", init<Unit, std::vector<double>, std::vector<double>, std::vector<model_node_ptr>, std::vector<int>, std::vector<int>, bool>()) + .def(init<std::string>()) + .def(init<std::string, std::string>()) + .def("__str__", &ModelRegressor::toString, "@DocString_model_reg_str@") + 
.def("__repr__", &ModelRegressor::toString, "@DocString_model_reg_str@") + .add_property("fit", &ModelRegressor::prop_train_est, "@DocString_model_reg_prop_train_est@") + .add_property("predict", &ModelRegressor::prop_test_est, "@DocString_model_reg_prop_test_est@") + .add_property("train_error", &ModelRegressor::train_error, "@DocString_model_reg_train_error@") + .add_property("test_error", &ModelRegressor::test_error, "@DocString_model_reg_test_error@") + .add_property("rmse", &ModelRegressor::rmse, "@DocString_model_reg_rmse@") + .add_property("test_rmse", &ModelRegressor::test_rmse, "@DocString_model_reg_test_rmse@") + .add_property("max_ae", &ModelRegressor::max_ae, "@DocString_model_reg_max_ae@") + .add_property("test_max_ae", &ModelRegressor::test_max_ae, "@DocString_model_reg_test_max_ae@") + .add_property("mae", &ModelRegressor::mae, "@DocString_model_reg_mae@") + .add_property("test_mae", &ModelRegressor::test_mae, "@DocString_model_reg_test_mae@") + .add_property("mape", &ModelRegressor::mape, "@DocString_model_reg_mape@") + .add_property("test_mape", &ModelRegressor::test_mape, "@DocString_model_reg_test_mape@") + .add_property("percentile_25_ae", &ModelRegressor::percentile_25_ae, "@DocString_model_reg_percentile_25_ae@") + .add_property("percentile_25_test_ae", &ModelRegressor::percentile_25_test_ae, "@DocString_model_reg_test_percentile_25_test_ae@") + .add_property("percentile_50_ae", &ModelRegressor::percentile_50_ae, "@DocString_model_reg_percentile_50_ae@") + .add_property("percentile_50_test_ae", &ModelRegressor::percentile_50_test_ae, "@DocString_model_reg_test_percentile_50_test_ae@") + .add_property("percentile_75_ae", &ModelRegressor::percentile_75_ae, "@DocString_model_reg_percentile_75_ae@") + .add_property("percentile_75_test_ae", &ModelRegressor::percentile_75_test_ae, "@DocString_model_reg_test_percentile_75_test_ae@") + .add_property("percentile_95_ae", &ModelRegressor::percentile_95_ae, "@DocString_model_reg_percentile_95_ae@") + 
.add_property("percentile_95_test_ae", &ModelRegressor::percentile_95_ae, "@DocString_model_reg_test_percentile_95_ae@") + ; +} + +void sisso::descriptor_identifier::registerModelClassifier() +{ + class_<ModelClassifier, bases<Model>>("ModelClassifier", init<Unit, std::vector<double>, std::vector<double>, std::vector<model_node_ptr>, std::vector<int>, std::vector<int>, bool>()) + .def(init<std::string>()) + .def(init<std::string, std::string>()) + .def("__str__", &ModelClassifier::toString, "@DocString_model_class_str@") + .def("__repr__", &ModelClassifier::toString, "@DocString_model_class_str@") + .add_property("fit", &ModelClassifier::prop_train_est, "@DocString_model_class_prop_train_est@") + .add_property("predict", &ModelClassifier::prop_test_est, "@DocString_model_class_prop_test_est@") + .add_property("train_error", &ModelClassifier::train_error, "@DocString_model_class_train_error@") + .add_property("test_error", &ModelClassifier::test_error, "@DocString_model_class_test_error@") + .add_property("percent_error", &ModelClassifier::percent_train_error, "@DocString_model_class_precent_train_error@") + .add_property("percent_test_error", &ModelClassifier::percent_test_error, "@DocString_model_class_precent_test_error@") + ; +} + void sisso::descriptor_identifier::registerSISSO_DI() { class_<sisso::descriptor_identifier::SISSO_DI_Wrap, boost::noncopyable>("SISSO_DI", no_init) - .def("fit", &SISSO_DI::fit, "@DocString_sisso_reg_fit@") .add_property("prop", &SISSO_DI::prop_py, "@DocString_sisso_reg_prop_py@") .add_property("prop_test", &SISSO_DI::prop_test_py, "@DocString_sisso_reg_prop_test_py@") - .add_property("models", &SISSO_DI::models_py, "@DocString_sisso_reg_models_py@") .add_property("n_samp", &SISSO_DI::n_samp, "@DocString_sisso_reg_n_samp@") .add_property("n_dim", &SISSO_DI::n_dim, "@DocString_sisso_reg_n_dim@") .add_property("n_residual", &SISSO_DI::n_residual, "@DocString_sisso_reg_n_residual@") @@ -397,6 +420,17 @@ void 
sisso::descriptor_identifier::registerSISSO_DI() void sisso::descriptor_identifier::registerSISSORegressor() { class_<SISSORegressor, bases<SISSO_DI>>("SISSORegressor", init<std::shared_ptr<FeatureSpace>, Unit, np::ndarray, np::ndarray, py::list, py::list, py::list, int, int, int, optional<bool>>()) + .def("fit", &SISSORegressor::fit, "@DocString_sisso_reg_fit@") + .def(init<std::shared_ptr<FeatureSpace>, Unit, py::list, py::list, py::list, py::list, py::list, int, int, int, optional<bool>>()) + .add_property("models", &SISSORegressor::models_py, "@DocString_sisso_reg_models_py@") + ; +} + +void sisso::descriptor_identifier::registerSISSOClassifier() +{ + class_<SISSOClassifier, bases<SISSO_DI>>("SISSOClassifier", init<std::shared_ptr<FeatureSpace>, Unit, np::ndarray, np::ndarray, py::list, py::list, py::list, int, int, int, optional<bool>>()) + .def("fit", &SISSOClassifier::fit, "@DocString_sisso_class_fit@") .def(init<std::shared_ptr<FeatureSpace>, Unit, py::list, py::list, py::list, py::list, py::list, int, int, int, optional<bool>>()) + .add_property("models", &SISSOClassifier::models_py, "@DocString_sisso_class_models_py@") ; } diff --git a/src/python/bindings_docstring_keyed.hpp b/src/python/bindings_docstring_keyed.hpp index ecf45ded7962ff04fc16096a4af04554fbafc74a..05048f0262058cf798647f72798634b606024602 100644 --- a/src/python/bindings_docstring_keyed.hpp +++ b/src/python/bindings_docstring_keyed.hpp @@ -9,6 +9,7 @@ #include <descriptor_identifier/SISSO_DI/SISSO_DI.hpp> #include <descriptor_identifier/SISSO_DI/SISSORegressor.hpp> +#include <descriptor_identifier/SISSO_DI/SISSOClassifier.hpp> #include <feature_creation/feature_space/FeatureSpace.hpp> #include <python/feature_creation/node_utils.hpp> @@ -200,8 +201,13 @@ namespace sisso { struct SISSO_DI_Wrap : SISSO_DI, py::wrapper<SISSO_DI> { - inline void set_error(){this->get_override("set_error")();} inline void l0_norm(){this->get_override("l0_norm")();} + inline void 
fit(){this->get_override("fit")();} + }; + + struct Model_Wrap : Model, py::wrapper<Model> + { + inline void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {}){this->get_override("to_file")();} }; /** @@ -209,6 +215,16 @@ namespace sisso */ static void registerModel(); + /** + * @brief Register the Model object for conversion to a python object + */ + static void registerModelRegressor(); + + /** + * @brief Register the Model object for conversion to a python object + */ + static void registerModelClassifier(); + /** * @brief Register the SISSORegressor object for conversion to a python object */ @@ -218,6 +234,12 @@ namespace sisso * @brief Register the SISSORegressor object for conversion to a python object */ static void registerSISSORegressor(); + + + /** + * @brief Register the SISSORegressor object for conversion to a python object + */ + static void registerSISSOClassifier(); } } diff --git a/src/python/descriptor_identifier/SISSOClassifier.cpp b/src/python/descriptor_identifier/SISSOClassifier.cpp new file mode 100644 index 0000000000000000000000000000000000000000..89332a00710197b1f5890ca183a9436edb339d68 --- /dev/null +++ b/src/python/descriptor_identifier/SISSOClassifier.cpp @@ -0,0 +1,123 @@ +#include <descriptor_identifier/SISSO_DI/SISSOClassifier.hpp> + +SISSOClassifier::SISSOClassifier( + std::shared_ptr<FeatureSpace> feat_space, + Unit prop_unit, + np::ndarray prop, + np::ndarray prop_test, + py::list task_sizes_train, + py::list task_sizes_test, + py::list leave_out_inds, + int n_dim, + int n_residual, + int n_models_store, + bool fix_intercept +) : + SISSO_DI(feat_space, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, fix_intercept), + _training(_task_sizes_train.size()), + _int_prop(_prop.size()), + _int_prop_test(_prop_test.size()), + _int_error(_prop.size()), + _trainer(1, !fix_intercept), + _c(1.0) +{ + std::vector<double> prop_vec = 
python_conv_utils::from_ndarray<double>(prop); + std::vector<double> prop_test_vec = python_conv_utils::from_ndarray<double>(prop_test); + + _trainer.stoppingCondition().maxSeconds = 0.01; + + double min_prop = *std::min_element(prop_vec.begin(), prop_vec.end()); + if(min_prop < 0.0) + { + std::transform(prop_vec.begin(), prop_vec.end(), prop_vec.begin(), [&min_prop](double pp){return pp - min_prop;}); + std::transform(prop_test_vec.begin(), prop_test_vec.end(), prop_test_vec.begin(), [&min_prop](double pp){return pp - min_prop;}); + } + std::vector<bool> not_int(prop_vec.size()); + std::transform(prop_vec.begin(), prop_vec.end(), not_int.begin(), [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;}); + if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;})) + throw std::logic_error("For classification the property (training set) must be an integer"); + + not_int.resize(prop_test_vec.size()); + std::transform(prop_test_vec.begin(), prop_test_vec.end(), not_int.begin(), [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;}); + if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;})) + throw std::logic_error("For classification the property (test set) must be an integer"); + + for(auto& pt : prop_test_vec) + if(std::none_of(prop_vec.begin(), prop_vec.end(), [&pt](double pp){return std::abs(pp - pt) < 1e-10;})) + throw std::logic_error("A class in the property vector (test set) is not in the training set."); + + std::transform(_prop.begin(), _prop.end(), _prop.begin(), [&min_prop](double pp){return pp - min_prop;}); + std::transform(_prop_test.begin(), _prop_test.end(), _prop_test.begin(), [&min_prop](double pp){return pp - min_prop;}); + + for(int pp = 0; pp < _prop.size(); ++pp) + _int_prop[pp] = static_cast<unsigned int>(std::round(_prop[pp])); + + for(int pp = 0; pp < _prop_test.size(); ++pp) + _int_prop_test[pp] = static_cast<unsigned int>(std::round(_prop_test[pp])); +} + +SISSOClassifier::SISSOClassifier( + 
std::shared_ptr<FeatureSpace> feat_space, + Unit prop_unit, + py::list prop, + py::list prop_test, + py::list task_sizes_train, + py::list task_sizes_test, + py::list leave_out_inds, + int n_dim, + int n_residual, + int n_models_store, + bool fix_intercept +) : + SISSO_DI(feat_space, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, fix_intercept), + _training(_task_sizes_train.size()), + _int_prop(_prop.size()), + _int_prop_test(_prop_test.size()), + _int_error(_prop.size()), + _trainer(1, !fix_intercept), + _c(1.0) +{ + std::vector<double> prop_vec = python_conv_utils::from_list<double>(prop); + std::vector<double> prop_test_vec = python_conv_utils::from_list<double>(prop_test); + + _trainer.stoppingCondition().maxSeconds = 0.01; + + double min_prop = *std::min_element(prop_vec.begin(), prop_vec.end()); + if(min_prop < 0.0) + { + std::transform(prop_vec.begin(), prop_vec.end(), prop_vec.begin(), [&min_prop](double pp){return pp - min_prop;}); + std::transform(prop_test_vec.begin(), prop_test_vec.end(), prop_test_vec.begin(), [&min_prop](double pp){return pp - min_prop;}); + } + std::vector<bool> not_int(prop_vec.size()); + std::transform(prop_vec.begin(), prop_vec.end(), not_int.begin(), [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;}); + if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;})) + throw std::logic_error("For classification the property (training set) must be an integer"); + + not_int.resize(prop_test_vec.size()); + std::transform(prop_test_vec.begin(), prop_test_vec.end(), not_int.begin(), [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;}); + if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;})) + throw std::logic_error("For classification the property (test set) must be an integer"); + + for(auto& pt : prop_test_vec) + if(std::none_of(prop_vec.begin(), prop_vec.end(), [&pt](double pp){return std::abs(pp - pt) < 1e-10;})) + 
throw std::logic_error("A class in the property vector (test set) is not in the training set."); + + for(int pp = 0; pp < _prop.size(); ++pp) + _int_prop[pp] = static_cast<unsigned int>(std::round(_prop[pp] - min_prop)); + + for(int pp = 0; pp < _prop_test.size(); ++pp) + _int_prop_test[pp] = static_cast<unsigned int>(std::round(_prop_test[pp] - min_prop)); +} + +py::list SISSOClassifier::models_py() +{ + py::list model_list; + for(auto& mod_list : _models) + { + py::list lst; + for(auto& mod : mod_list) + lst.append<ModelClassifier>(mod); + model_list.append<py::list>(lst); + } + return model_list; +} diff --git a/src/python/descriptor_identifier/SISSORegressor.cpp b/src/python/descriptor_identifier/SISSORegressor.cpp index 96a3bc454bda6a7a62ea9e03921b171d875c069a..d7bccdd35892416238a323d9ef055f5f2548cadd 100644 --- a/src/python/descriptor_identifier/SISSORegressor.cpp +++ b/src/python/descriptor_identifier/SISSORegressor.cpp @@ -54,3 +54,16 @@ SISSORegressor::SISSORegressor( _lwork = get_opt_lwork(_n_dim + 1); _work = std::vector<double>(_lwork, 0.0); } + +py::list SISSORegressor::models_py() +{ + py::list model_list; + for(auto& mod_list : _models) + { + py::list lst; + for(auto& mod : mod_list) + lst.append<ModelRegressor>(mod); + model_list.append<py::list>(lst); + } + return model_list; +} diff --git a/src/python/descriptor_identifier/SISSO_DI.cpp b/src/python/descriptor_identifier/SISSO_DI.cpp index b8d8bf6f3cb23a4d727cda71ac5829132fb83bd2..3a4456b9fe9692e058e2a70464f64b1f09cb4334 100644 --- a/src/python/descriptor_identifier/SISSO_DI.cpp +++ b/src/python/descriptor_identifier/SISSO_DI.cpp @@ -61,16 +61,3 @@ SISSO_DI::SISSO_DI( { node_value_arrs::initialize_d_matrix_arr(); } - -py::list SISSO_DI::models_py() -{ - py::list model_list; - for(auto& mod_list : _models) - { - py::list lst; - for(auto& mod : mod_list) - lst.append<Model>(mod); - model_list.append<py::list>(lst); - } - return model_list; -} diff --git 
a/tests/test_descriptor_identifier/test_model.py b/tests/test_descriptor_identifier/test_model.py index bbd682c41ca156246c1374595e8b3a744550119b..0f156dbab5cc422d78c9d2f90c75b270618ea827 100644 --- a/tests/test_descriptor_identifier/test_model.py +++ b/tests/test_descriptor_identifier/test_model.py @@ -1,4 +1,4 @@ -from cpp_sisso import Model +from cpp_sisso import ModelRegressor from pathlib import Path import numpy as np @@ -12,7 +12,7 @@ parent = Path(__file__).parent def test_model(): - model = Model( + model = ModelRegressor( str(parent / "model_files/train.dat"), str(parent / "model_files/test.dat") ) assert np.all(np.abs(model.fit - model.prop_train) < 1e-7)