From 29b01178269fe92e78518458ed89f7c08cd70889 Mon Sep 17 00:00:00 2001 From: Thomas Purcell <purcell@fhi-berlin.mpg.de> Date: Sun, 11 Oct 2020 16:11:22 +0200 Subject: [PATCH] LogRegressor appears to be working Added a LogRegressor and did some refactoring to make use of ModelRegerssors --- CMakeLists.txt | 52 ++++-- cmake/CoinUtils/coin_utils_configure.sh | 2 + cmake/coin-Clp/clp_configure.sh | 2 +- src/CMakeLists.txt | 4 +- .../Model/ModelLogRegressor.cpp | 116 +++++++++++++ .../Model/ModelLogRegressor.hpp | 122 ++++++++++++++ .../Model/ModelRegressor.hpp | 5 +- .../SISSO_DI/SISSOLogRegressor.cpp | 69 ++++++++ .../SISSO_DI/SISSOLogRegressor.hpp | 157 ++++++++++++++++++ .../SISSO_DI/SISSORegressor.cpp | 75 +++++---- .../SISSO_DI/SISSORegressor.hpp | 16 +- .../feature_space/FeatureSpace.cpp | 5 + src/main.cpp | 19 +++ src/python/bindings_docstring_keyed.cpp | 21 +++ src/python/bindings_docstring_keyed.hpp | 10 ++ .../SISSOLogRegressor.cpp | 57 +++++++ src/utils/math_funcs.cpp | 33 ++++ src/utils/math_funcs.hpp | 31 ++++ src/utils/project.cpp | 39 +++++ src/utils/project.hpp | 22 +++ 20 files changed, 796 insertions(+), 61 deletions(-) create mode 100644 cmake/CoinUtils/coin_utils_configure.sh create mode 100644 src/descriptor_identifier/Model/ModelLogRegressor.cpp create mode 100644 src/descriptor_identifier/Model/ModelLogRegressor.hpp create mode 100644 src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.cpp create mode 100644 src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp create mode 100644 src/python/descriptor_identifier/SISSOLogRegressor.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b4671acd..9044313d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -281,13 +281,11 @@ list(GET MPI_CXX_LIBRARIES 0 MPI_LIBRARY) get_filename_component(MPI_DIR ${MPI_LIBRARY} DIRECTORY) # Coin-Clp for linear programing -set(COIN_CLP_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/coin-Clp/build/") -set(COIN_CLP_INSTALL_DIR 
"${CMAKE_CURRENT_BINARY_DIR}/external/coin-Clp/bin/") -set(COIN_CLP_INCLUDE_DIRS "${COIN_CLP_INSTALL_DIR}/include/;${COIN_CLP_INSTALL_DIR}/include/coin") -set(COIN_CLP_LIBRARY_DIRS "${COIN_CLP_INSTALL_DIR}/lib") -set(COIN_CLP_BLAS_LAPACK_LIBS "-L${LAPACK_DIR}") -set(COIN_CLP_URL "https://www.coin-or.org/download/source/Clp/Clp-1.17.6.tgz") - +set(COIN_UTILS_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/CoinUtils/build/") +set(COIN_UTILS_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/CoinUtils/bin/") +set(COIN_UTILS_INCLUDE_DIRS "${COIN_UTILS_INSTALL_DIR}/include/coin/;${COIN_UTILS_INSTALL_DIR}/include/") +set(COIN_UTILS_LIBRARY_DIRS "${COIN_UTILS_INSTALL_DIR}/lib/") +set(COIN_UTILS_BLAS_LAPACK_LIBS "-L${LAPACK_DIR}") foreach(LAPACK_LIB_FILE IN LISTS LAPACK_LIBRARIES) get_filename_component(LAPACK_LIB ${LAPACK_LIB_FILE} NAME_WE) string(REPLACE "lib" "-l" LAPACK_LIB ${LAPACK_LIB}) @@ -295,15 +293,38 @@ foreach(LAPACK_LIB_FILE IN LISTS LAPACK_LIBRARIES) endforeach() set(COIN_CLP_BLAS_LAPACK_LIBS "${COIN_CLP_BLAS_LAPACK_LIBS}") message(STATUS "COIN_CLP_BLAS_LAPACK_LIBS = ${COIN_CLP_BLAS_LAPACK_LIBS}") -set(COIN_CLP_CONFIGURE_COMMAND bash ${CMAKE_CURRENT_LIST_DIR}/cmake/coin-Clp/clp_configure.sh ${COIN_CLP_INSTALL_DIR} ${COIN_CLP_BLAS_LAPACK_LIBS} ${CMAKE_CXX_COMPILER}) +set(COIN_UTILS_CONFIGURE_COMMAND bash ${CMAKE_CURRENT_LIST_DIR}/cmake/CoinUtils/coin_utils_configure.sh ${COIN_UTILS_INSTALL_DIR} ${COIN_CLP_BLAS_LAPACK_LIBS} ${CMAKE_CXX_COMPILER}) + +ExternalProject_Add( + external_CoinUtils + PREFIX "external/CoinUtils" + GIT_REPOSITORY "https://github.com/coin-or/CoinUtils.git" + GIT_TAG "releases/2.11.4" + CONFIGURE_COMMAND "${COIN_UTILS_CONFIGURE_COMMAND}" + BUILD_COMMAND make -j ${BOOST_BUILD_N_PROCS} + INSTALL_COMMAND make install + BINARY_DIR "${COIN_UTILS_BUILD_DIR}" + INSTALL_DIR "${COIN_UTILS_INSTALL_DIR}" +) + +add_library( CoinUtils SHARED IMPORTED ) +set_property( TARGET CoinUtils PROPERTY IMPORTED_LOCATION ${COIN_UTILS_LIBRARY_DIRS}/libCoinUtils.so 
) +set_property( TARGET CoinUtils PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${COIN_UTILS_INCLUDE_DIRS} ) +ExternalProject_Add_StepDependencies(external_CoinUtils CoinUtils) + +include_directories(${COIN_UTILS_INCLUDE_DIRS}) + +set(COIN_CLP_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/coin-Clp/build/") +set(COIN_CLP_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/coin-Clp/bin/") +set(COIN_CLP_INCLUDE_DIRS "${COIN_CLP_INSTALL_DIR}/include/;${COIN_CLP_INSTALL_DIR}/include/coin") +set(COIN_CLP_LIBRARY_DIRS "${COIN_CLP_INSTALL_DIR}/lib") +set(COIN_CLP_CONFIGURE_COMMAND bash ${CMAKE_CURRENT_LIST_DIR}/cmake/coin-Clp/clp_configure.sh ${COIN_CLP_INSTALL_DIR} ${COIN_CLP_BLAS_LAPACK_LIBS} ${CMAKE_CXX_COMPILER} ${COIN_UTILS_INSTALL_DIR}) ExternalProject_Add( external_Clp PREFIX "external/coin-Clp" - URL ${COIN_CLP_URL} - # SVN_REPOSITORY "https://projects.coin-or.org/svn/Clp/stable/1.17" - # BUILD_IN_SOURCE 1 - # CONFIGURE_COMMAND "${CMAKE_CURRENT_BINARY_DIR}/external/coin-Clp/src/external_Clp/configure" + GIT_REPOSITORY "https://github.com/coin-or/Clp.git" + GIT_TAG "releases/1.17.6" CONFIGURE_COMMAND "${COIN_CLP_CONFIGURE_COMMAND}" BUILD_COMMAND make -j ${BOOST_BUILD_N_PROCS} INSTALL_COMMAND make install @@ -314,14 +335,11 @@ ExternalProject_Add( add_library( Clp SHARED IMPORTED ) set_property( TARGET Clp PROPERTY IMPORTED_LOCATION ${COIN_CLP_LIBRARY_DIRS}/libClp.so ) set_property( TARGET Clp PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${COIN_CLP_INCLUDE_DIRS} ) +ExternalProject_Add_StepDependencies(external_CoinUtils external_Clp) ExternalProject_Add_StepDependencies(external_Clp Clp) -add_library( CoinUtils SHARED IMPORTED ) -set_property( TARGET CoinUtils PROPERTY IMPORTED_LOCATION ${COIN_CLP_LIBRARY_DIRS}/libCoinUtils.so ) -set_property( TARGET CoinUtils PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${COIN_CLP_INCLUDE_DIRS} ) -ExternalProject_Add_StepDependencies(external_Clp CoinUtils) -set(COIN_CLP_LIBRARIES 
"${COIN_CLP_LIBRARY_DIRS}/libClp.so;${COIN_CLP_LIBRARY_DIRS}/libCoinUtils.so") +set(COIN_CLP_LIBRARIES "${COIN_CLP_LIBRARY_DIRS}/libClp.so;${COIN_UTILS_LIBRARY_DIRS}/libCoinUtils.so") include_directories(${COIN_CLP_INCLUDE_DIRS}) include_directories(${CMAKE_CURRENT_LIST_DIR}/src) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src) diff --git a/cmake/CoinUtils/coin_utils_configure.sh b/cmake/CoinUtils/coin_utils_configure.sh new file mode 100644 index 00000000..8ea6d6c8 --- /dev/null +++ b/cmake/CoinUtils/coin_utils_configure.sh @@ -0,0 +1,2 @@ +#! /usr/bin/bash +../src/external_CoinUtils/configure -C CXX=$3 --prefix=$1 --with-lapack-lflags="$2" diff --git a/cmake/coin-Clp/clp_configure.sh b/cmake/coin-Clp/clp_configure.sh index 6163e8fe..95478a98 100644 --- a/cmake/coin-Clp/clp_configure.sh +++ b/cmake/coin-Clp/clp_configure.sh @@ -1,2 +1,2 @@ #! /usr/bin/bash -../src/external_Clp/configure -C CXX=$3 --prefix=$1 --with-lapack-lib="$2" --with-blas-lib="$2" +../src/external_Clp/configure -C CXX=$3 --prefix=$1 --with-lapack-lib="$2" --with-blas-lib="$2" --with-coinutils-lib="-L$4/lib -lCoinUtils" --with-coinutils-incdir="$4/include/coin/" \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 90cbbe94..baec9eee 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,7 +13,7 @@ include_directories(${CMAKE_CURRENT_LIST_DIR}/python/) include_directories(${CMAKE_CURRENT_LIST_DIR}/utils/) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations") -set(CMAKE_INSTALL_RPATH ${Boost_LIBRARY_DIRS};${LAPACK_DIR};${MPI_DIR};${COIN_CLP_LIBRARY_DIRS}) +set(CMAKE_INSTALL_RPATH ${Boost_LIBRARY_DIRS};${LAPACK_DIR};${MPI_DIR};${COIN_CLP_LIBRARY_DIRS};${COIN_UTILS_LIBRARY_DIRS}) # set(INSTALL_RPATH ${Boost_LIB_DIR}) file(GLOB_RECURSE SISSOPP_SOURCES *.cpp) @@ -36,7 +36,7 @@ install(TARGETS sisso++ DESTINATION ${CMAKE_CURRENT_LIST_DIR}/../bin/) if(USE_PYTHON) include(${CMAKE_CURRENT_LIST_DIR}/../cmake/TransferDocStrings.cmake) - 
set(CMAKE_INSTALL_RPATH "${Boost_LIBRARY_DIRS};${PYTHON_PREFIX}/lib/;${MPI_DIR};${COIN_CLP_LIBRARY_DIRS}") + set(CMAKE_INSTALL_RPATH "${Boost_LIBRARY_DIRS};${PYTHON_PREFIX}/lib/;${MPI_DIR};${COIN_CLP_LIBRARY_DIRS};${COIN_UTILS_LIBRARY_DIRS}") transfer_doc_string(${CMAKE_CURRENT_LIST_DIR}/python/bindings_docstring_keyed.cpp ${CMAKE_CURRENT_LIST_DIR}/python/bindings.cpp) transfer_doc_string(${CMAKE_CURRENT_LIST_DIR}/python/bindings_docstring_keyed.hpp ${CMAKE_CURRENT_LIST_DIR}/python/bindings.hpp) diff --git a/src/descriptor_identifier/Model/ModelLogRegressor.cpp b/src/descriptor_identifier/Model/ModelLogRegressor.cpp new file mode 100644 index 00000000..969edca5 --- /dev/null +++ b/src/descriptor_identifier/Model/ModelLogRegressor.cpp @@ -0,0 +1,116 @@ +#include <descriptor_identifier/Model/ModelLogRegressor.hpp> + +ModelLogRegressor::ModelLogRegressor(Unit prop_unit, std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept) : + ModelRegressor(prop_unit, prop_train, prop_test, feats, task_sizes_train, task_sizes_test, fix_intercept), + _log_prop_train(_n_samp_train, 0.0), + _log_prop_test(_n_samp_test, 0.0), + _log_prop_train_est(_n_samp_train, 0.0), + _log_prop_test_est(_n_samp_test, 0.0), + _log_train_error(_n_samp_train), + _log_test_error(_n_samp_test) +{ + std::transform(_prop_train.begin(), _prop_train.end(), _log_prop_train.begin(),[](double p){return std::log(p);}); + std::transform(_prop_test.begin(), _prop_test.end(), _log_prop_test.begin(),[](double p){return std::log(p);}); + + + _log_prop_train.reserve(_n_samp_train); + _log_prop_test.reserve(_n_samp_test); + + _log_prop_train_est.reserve(_n_samp_train); + _log_prop_test_est.reserve(_n_samp_test); + + std::vector<double> a(_D_train.size(), 1.0); + std::vector<double> work(_D_train.size(), 0.0); + + int info; + int start = 0; + for(int tt = 0; tt < _task_sizes_train.size(); 
++tt) + { + int sz = _task_sizes_train[tt]; + std::fill_n(a.data(), a.size(), 1.0); + std::fill_n(_D_train.data(), _D_train.size(), 1.0); + + for(int ff = 0; ff < feats.size(); ++ff) + { + std::transform(feats[ff]->value_ptr() + start, feats[ff]->value_ptr() + start + sz, _D_train.data() + ff * sz, [](double fv){return std::log(fv);}); + std::transform(feats[ff]->value_ptr() + start, feats[ff]->value_ptr() + start + sz, a.data() + ff * sz, [](double fv){return std::log(fv);}); + } + + dgels_('N', sz, _n_dim + 1 - _fix_intercept, 1, a.data(), sz, _log_prop_train.data() + start, sz, work.data(), work.size(), &info); + + std::copy_n(_log_prop_train.begin() + start, _n_dim + 1 - _fix_intercept, _coefs[tt].data()); + std::transform(_prop_train.begin() + start, _prop_train.begin() + start + sz, _log_prop_train.begin() + start, [](double p){return std::log(p);}); + + dgemv_('N', sz, _n_dim + 1 - _fix_intercept, 1.0, _D_train.data(), sz, _coefs.back().data(), 1, 0.0, _log_prop_train_est.data() + start, 1); + std::transform(_log_prop_train_est.begin() + start, _log_prop_train_est.begin() + start + sz, _log_prop_train.data() + start, _log_train_error.data() + start, std::minus<double>()); + + start += sz; + } + + start = 0; + int ii = 0; + for(auto& sz : _task_sizes_test) + { + if(sz > 0) + { + for(int ff = 0; ff < feats.size(); ++ff) + std::transform(feats[ff]->test_value_ptr() + start, feats[ff]->test_value_ptr() + start + sz, _D_test.data() + ff * sz, [](double fv){return std::log(fv);}); + + std::fill_n(_D_test.data() + feats.size() * sz, sz, 1.0); + dgemv_('N', sz, _n_dim + 1 - _fix_intercept, 1.0, _D_test.data(), sz, _coefs[ii].data(), 1, 0.0, _log_prop_test_est.data() + start, 1); + std::transform(_log_prop_test_est.begin() + start, _log_prop_test_est.begin() + start + sz, _log_prop_test.data() + start, _log_test_error.data() + start, std::minus<double>()); + } + ++ii; + start += sz; + } + + std::transform(_log_prop_train_est.begin(), _log_prop_train_est.end(), 
_prop_train_est.begin(), [](double lp){return std::exp(lp);}); + std::transform(_log_prop_test_est.begin(), _log_prop_test_est.end(), _prop_test_est.begin(), [](double lp){return std::exp(lp);}); + + std::transform(_prop_train.begin(), _prop_train.end(), _prop_train_est.begin(), _train_error.begin(), [](double pt, double pt_est){return pt - pt_est;}); + std::transform(_prop_test.begin(), _prop_test.end(), _prop_test_est.begin(), _test_error.begin(), [](double pt, double pt_est){return pt - pt_est;}); +} + +ModelLogRegressor::ModelLogRegressor(std::string train_file) : + ModelRegressor(train_file) +{ + std::transform(_D_train.begin(), _D_train.begin() + _n_samp_train, _D_train.begin(), [](double dt){return std::log(dt);}); + std::transform(_prop_train.begin(), _prop_train.end(), _log_prop_train.begin(), [](double pt){return std::log(pt);}); + std::transform(_prop_train_est.begin(), _prop_train_est.end(), _log_prop_train_est.begin(), [](double pt){return std::log(pt);}); + std::transform(_prop_train.begin(), _prop_train.end(), _prop_train_est.begin(), _train_error.begin(), [](double pt, double pt_est){return pt - pt_est;}); + std::transform(_log_prop_train.begin(), _log_prop_train.end(), _log_prop_train_est.begin(), _log_train_error.begin(), [](double pt, double pt_est){return pt - pt_est;}); +} + +ModelLogRegressor::ModelLogRegressor(std::string train_file, std::string test_file) : + ModelRegressor(train_file, test_file) +{ + std::transform(_D_train.begin(), _D_train.begin() + _n_samp_train, _D_train.begin(), [](double dt){return std::log(dt);}); + std::transform(_D_test.begin(), _D_test.begin() + _n_samp_test, _D_test.begin(), [](double dt){return std::log(dt);}); + + std::transform(_prop_train.begin(), _prop_train.end(), _log_prop_train.begin(), [](double pt){return std::log(pt);}); + std::transform(_prop_test.begin(), _prop_test.end(), _log_prop_test.begin(), [](double pt){return std::log(pt);}); + + std::transform(_prop_train_est.begin(), _prop_train_est.end(), 
_log_prop_train_est.begin(), [](double pt){return std::log(pt);}); + std::transform(_prop_test_est.begin(), _prop_test_est.end(), _log_prop_test_est.begin(), [](double pt){return std::log(pt);}); + + std::transform(_prop_train.begin(), _prop_train.end(), _prop_train_est.begin(), _train_error.begin(), [](double pt, double pt_est){return pt - pt_est;}); + std::transform(_prop_test.begin(), _prop_test.end(), _prop_test_est.begin(), _test_error.begin(), [](double pt, double pt_est){return pt - pt_est;}); + + std::transform(_log_prop_train.begin(), _log_prop_train.end(), _log_prop_train_est.begin(), _log_train_error.begin(), [](double pt, double pt_est){return pt - pt_est;}); + std::transform(_log_prop_test.begin(), _log_prop_test.end(), _log_prop_test_est.begin(), _log_test_error.begin(), [](double pt, double pt_est){return pt - pt_est;}); +} + +std::string ModelLogRegressor::toString() const +{ + std::stringstream unit_rep; + unit_rep << "c0"; + for(int ff = 0; ff < _feats.size(); ++ff) + unit_rep << " * (" << _feats[ff]->expr() << ")^a" << ff; + return unit_rep.str(); +} + +std::ostream& operator<< (std::ostream& outStream, const ModelLogRegressor& model) +{ + outStream << model.toString(); + return outStream; +} \ No newline at end of file diff --git a/src/descriptor_identifier/Model/ModelLogRegressor.hpp b/src/descriptor_identifier/Model/ModelLogRegressor.hpp new file mode 100644 index 00000000..b54ec3a1 --- /dev/null +++ b/src/descriptor_identifier/Model/ModelLogRegressor.hpp @@ -0,0 +1,122 @@ +/** @file descriptor_identifier/ModelLogRegressor/ModelLogRegressor.hpp + * @brief Object to store the models generated form SISSO + * + * Creates a ModelLogRegressor generated from SISSO and the corresponding output file. + * It also has functionality to read in an output file to regenerate the model. + * + * @author Thomas A. R. Purcell (tpurcell) + * @bug No known bugs. 
+ */ +#ifndef MODEL_LOG_REGRESSOR +#define MODEL_LOG_REGRESSOR + +#include <descriptor_identifier/Model/ModelRegressor.hpp> + +#ifdef PY_BINDINGS + namespace np = boost::python::numpy; + namespace py = boost::python; +#endif + +typedef std::shared_ptr<ModelNode> model_node_ptr; + +// DocString: cls_model_reg +/** + * @brief Class to store the models found from SISSO + * + */ +class ModelLogRegressor : public ModelRegressor +{ + std::vector<double> _log_prop_train; //!< The log of the real Property + std::vector<double> _log_prop_test; //!< The log of the real Property + + std::vector<double> _log_prop_train_est; //!< The estimated Property (training data) + std::vector<double> _log_prop_test_est; //!< The estimated Property (testing data) + + std::vector<double> _log_train_error; //!< The error of the model (training) + std::vector<double> _log_test_error; //!< The error of the model (testing) + +public: + /** + * @brief Constructor for the model + * + * @param prop_unit The unit of the property + * @param prop_train The property vector for the training samples + * @param prop_test The property vector for the test samples + * @param feats The features for the model + * @param task_sizes_train Number of samples per task in the training data + * @param task_sizes_test Number of samples per task in the test data + */ + ModelLogRegressor(Unit prop_unit, std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept); + + // DocString: model_reg_init_str + /** + * @brief Construct a model from a training output file + * @details Reads in all of the data from the output file and recreates the model object + * + * @param train_file Previously generated model file + */ + ModelLogRegressor(std::string train_file); + + // DocString: model_reg_init_str_str + /** + * @brief Construct a model from a training and testing output file + * @details Reads in all 
of the data from the output files and recreates the model object + * + * @param train_file Previously generated training model output file + * @param train_file Previously generated testing model output file + */ + ModelLogRegressor(std::string train_file, std::string test_file); + + /** + * @brief Copy Constructor + * + * @param o ModelLogRegressor to be copied + */ + ModelLogRegressor(const ModelLogRegressor& o) = default; + + /** + * @brief Move Constructor + * + * @param o ModelLogRegressor to be moved + */ + ModelLogRegressor(ModelLogRegressor&& o) = default; + + /** + * @brief Copy Assignment operator + * + * @param o ModelLogRegressor to be copied + */ + ModelLogRegressor& operator= (const ModelLogRegressor& o) = default; + + /** + * @brief Move Assignment operator + * + * @param o ModelLogRegressor to be moved + */ + ModelLogRegressor& operator= (ModelLogRegressor&& o) = default; + + // DocString: model_reg_str + /** + * @brief Convert the model to a string + + * @return The string representation of the model + */ + std::string toString() const; + + /** + * @brief Copy the error into a new array + * + * @param res pointer to the beginning of the vector to store the residual + */ + inline void copy_error(double* res){std::copy_n(_log_train_error.data(), _n_samp_train, res);} +}; + +/** + * @brief Print a model to an string stream + * + * @param outStream The output stream the model is to be printed + * @param model The model to be printed + */ +std::ostream& operator<< (std::ostream& outStream, const ModelLogRegressor& model); + +#endif \ No newline at end of file diff --git a/src/descriptor_identifier/Model/ModelRegressor.hpp b/src/descriptor_identifier/Model/ModelRegressor.hpp index 4a28d58f..be1419b2 100644 --- a/src/descriptor_identifier/Model/ModelRegressor.hpp +++ b/src/descriptor_identifier/Model/ModelRegressor.hpp @@ -36,6 +36,7 @@ typedef std::shared_ptr<ModelNode> model_node_ptr; */ class ModelRegressor : public Model { +protected: 
std::vector<double> _prop_train_est; //!< The estimated Property (training data) std::vector<double> _prop_test_est; //!< The estimated Property (testing data) @@ -119,14 +120,14 @@ public: * @return The string representation of the model */ - std::string toString() const; + virtual std::string toString() const; /** * @brief Copy the error into a new array * * @param res pointer to the beginning of the vector to store the residual */ - inline void copy_error(double* res){std::copy_n(_train_error.data(), _n_samp_train, res);} + virtual inline void copy_error(double* res){std::copy_n(_train_error.data(), _n_samp_train, res);} // DocString: model_reg_rmse /** diff --git a/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.cpp b/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.cpp new file mode 100644 index 00000000..d2aa4206 --- /dev/null +++ b/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.cpp @@ -0,0 +1,69 @@ +#include <descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp> + +SISSOLogRegressor::SISSOLogRegressor( + std::shared_ptr<FeatureSpace> feat_space, + Unit prop_unit, + std::vector<double> prop, + std::vector<double> prop_test, + std::vector<int> task_sizes_train, + std::vector<int> task_sizes_test, + std::vector<int> leave_out_inds, + int n_dim, + int n_residual, + int n_models_store, + bool fix_intercept +): + SISSORegressor(feat_space, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, fix_intercept), + _prop_no_log(prop) +{ + if(*std::min_element(prop.begin(), prop.end()) <= 0.0) + throw std::logic_error("Requesting to preform a Log Regression on a property that has values <= 0.0"); + std::transform(_prop.begin(), _prop.end(), _prop.begin(), [](double p){return std::log(p);}); +} + +void SISSOLogRegressor::set_a(std::vector<int>& inds, int start, int n_samp, double* a) +{ + for(int ii = 0; ii < inds.size(); ++ii) + std::transform(node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, 
node_value_arrs::get_d_matrix_ptr(inds[ii]) + start + n_samp, a + ii * n_samp, [](double v){return std::log(v);}); + + if(!_fix_intercept) + std::fill_n(a + inds.size() * n_samp, n_samp, 1.0); +} + +void SISSOLogRegressor::set_error(std::vector<int>& inds, double* coefs, int start, int n_samp, double* error) +{ + std::fill_n(error + start, n_samp, -1 * (!_fix_intercept) * coefs[inds.size()]); + for(int ii = 0; ii < inds.size(); ++ii) + std::transform(node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, node_value_arrs::get_d_matrix_ptr(inds[ii]) + start + n_samp, error + start, error + start, [&coefs, &ii](double fv, double e){return e - coefs[ii] * std::log(fv);}); + + daxpy_(n_samp, 1.0, _prop.data() + start, 1, error + start, 1); +} + +void SISSOLogRegressor::output_models(std::vector<double>& residual) +{ + if(_mpi_comm->rank() == 0) + { + for(int rr = 0; rr < _n_models_store; ++rr) + { + _models.back()[rr].to_file("models/train_dim_" + std::to_string(_models.size()) + "_model_" + std::to_string(rr) + ".dat"); + if(_leave_out_inds.size() > 0) + _models.back()[rr].to_file("models/test_dim_" + std::to_string(_models.size()) + "_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds); + } + } + for(int rr = 0; rr < _n_residual; ++rr) + _models.back()[rr].copy_error(&residual[rr * _n_samp]); +} + +void SISSOLogRegressor::add_model(std::vector<int> indexes) +{ + if(_models.size() < indexes.size()) + _models.push_back({}); + std::vector<model_node_ptr> min_nodes(indexes.size()); + for(int ii = 0; ii < indexes.size(); ++ii) + { + int index = indexes[ii]; + min_nodes[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]->arr_ind(), _feat_space->phi_selected()[index]->rung(), _feat_space->phi_selected()[index]->expr(), _feat_space->phi_selected()[index]->postfix_expr(), _feat_space->phi_selected()[index]->value(), _feat_space->phi_selected()[index]->test_value(), _feat_space->phi_selected()[index]->unit()); + } + ModelLogRegressor model(_prop_unit, 
_prop_no_log, _prop_test, min_nodes, _task_sizes_train, _task_sizes_test, _fix_intercept); + _models.back().push_back(model); +} diff --git a/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp b/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp new file mode 100644 index 00000000..b764149c --- /dev/null +++ b/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp @@ -0,0 +1,157 @@ +/** @file descriptor_identifier/SISSOLogRegressor.hpp + * @brief Perform SISSO on a previously generated Feature Space + * + * Takes in a feature space and performs SISSO on it while storing the selected features in _models + * + * @author Thomas A. R. Purcell (tpurcell) + * @bug No known bugs. + */ +#ifndef SISSO_LOG_REGRESSOR +#define SISSO_LOG_REGRESSOR + +#include <descriptor_identifier/Model/ModelLogRegressor.hpp> +#include <descriptor_identifier/SISSO_DI/SISSORegressor.hpp> + +#ifdef PY_BINDINGS + namespace np = boost::python::numpy; + namespace py = boost::python; +#endif + +// DocString: cls_sisso_log_reg +/** + * @brief SISSO Regressor class, performs the SISSO algorithm and stores all selected models + * + */ +class SISSOLogRegressor: public SISSORegressor +{ +private: + std::vector<std::vector<ModelLogRegressor>> _models; //!< List of models + +protected: + std::vector<double> _prop_no_log; //!< Copy of the property vector without taking the log. 
+ +public: + /** + * @brief Constructor for the Regressor + * + * @param feat_space The feature space to run SISSO on + * @param prop_unit The unit of the property + * @param prop Vector storing all data to train the SISSO models with + * @param prpo_test Vector storing all data to test the SISSO models with + * @param task_sizes_train Number of training samples per task + * @param task_sizes_test Number of testing samples per task + * @param leave_out_inds List of indexes from the initial data file in the test set + * @param n_dim Maximum dimensionality of the generated models + * @param n_residual Number of residuals to pass to the next SIS operation + * @param n_models_store Number of features to store in files + * @param fix_intrecept If true fix intercept to 0 + */ + SISSOLogRegressor(std::shared_ptr<FeatureSpace> feat_space, Unit prop_unit, std::vector<double> prop, std::vector<double> prop_test, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, std::vector<int> leave_out_inds, int n_dim, int n_residual, int n_models_store, bool fix_intercept=false); + + /** + * @brief Set the A matrix for the least squares problem + * + * @param inds indexes of the selected features + * @param start The index in the property and feature vectors start copying into _b and _a + * @param n_samp number of samples to perform least squares optimization on + */ + void set_a(std::vector<int>& inds, int start, int n_samp, double* a); + + /** + * @brief Set the residual for the next step + * + * @param inds indexes of the selected features + * @param coeffs Coefficients of the model + * @param start The index in the property and feature vectors start copying into _b and _a + * @param n_samp number of samples to perform least squares optimization on + */ + void set_error(std::vector<int>& inds, double* coefs, int start, int n_samp, double* error); + + void add_model(std::vector<int> indexes); + + void output_models(std::vector<double>& residual); + + /** + * @brief 
Preform the l0 normalization for a property or the residual + * + * @param prop The property to fit + * @param n_dim the dimensionality of the model + */ + // void l0_norm(std::vector<double>& prop, int n_dim); + + /** + * @brief Acessor function for models + */ + inline std::vector<std::vector<ModelLogRegressor>> models(){return _models;} + + // Python interface functions + #ifdef PY_BINDINGS + /** + * @brief Constructor for the Regressor that takes in python objects (cpp definition in <python/descriptor_identifier/SISSOLogRegressor.cpp) + * + * @param feat_space The feature space to run SISSO on + * @param prop_unit The unit of the property + * @param prop Vector storing all data to train the SISSO models with + * @param prpo_test Vector storing all data to test the SISSO models with + * @param task_sizes_train Number of training samples per task + * @param task_sizes_test Number of testing samples per task + * @param leave_out_inds List of indexes from the initial data file in the test set + * @param n_dim Maximum dimensionality of the generated models + * @param n_residual Number of residuals to pass to the next SIS operation + * @param n_models_store Number of features to store in files + * @param fix_intrecept If true fix intercept to 0 + */ + SISSOLogRegressor( + std::shared_ptr<FeatureSpace> feat_space, + Unit prop_unit, + np::ndarray prop, + np::ndarray prop_test, + py::list task_sizes_train, + py::list task_sizes_test, + py::list leave_out_inds, + int n_dim, + int n_residual, + int n_models_store, + bool fix_intercept=false + ); + + // DocString: sisso_log_reg_init_list + /** + * @brief Constructor for the Regressor that takes in python objects (cpp definition in <python/descriptor_identifier/SISSOLogRegressor.cpp) + * + * @param feat_space The feature space to run SISSO on + * @param prop_unit The unit of the property + * @param prop Vector storing all data to train the SISSO models with + * @param prpo_test Vector storing all data to test the SISSO 
models with + * @param task_sizes_train Number of training samples per task + * @param task_sizes_test Number of testing samples per task + * @param leave_out_inds List of indexes from the initial data file in the test set + * @param n_dim Maximum dimensionality of the generated models + * @param n_residual Number of residuals to pass to the next SIS operation + * @param n_models_store Number of features to store in files + * @param fix_intrecept If true fix intercept to 0 + */ + SISSOLogRegressor( + std::shared_ptr<FeatureSpace> feat_space, + Unit prop_unit, + py::list prop, + py::list prop_test, + py::list task_sizes_train, + py::list task_sizes_test, + py::list leave_out_inds, + int n_dim, + int n_residual, + int n_models_store, + bool fix_intercept=false + ); + + // DocString: sisso_log_reg_models_py + /** + * @brief The selected models (cpp definition in <python/descriptor_identifier/SISSO_DI.cpp) + * @return models as a python list + */ + py::list models_py(); + #endif +}; + +#endif diff --git a/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp b/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp index 1397e9da..d1bafb75 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp +++ b/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp @@ -70,6 +70,35 @@ void SISSORegressor::set_error(std::vector<int>& inds, double* coefs, int start, } +void SISSORegressor::add_model(std::vector<int> indexes) +{ + if(_models.size() < indexes.size()) + _models.push_back({}); + + std::vector<model_node_ptr> min_nodes(indexes.size()); + for(int ii = 0; ii < indexes.size(); ++ii) + { + int index = indexes[ii]; + min_nodes[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]->arr_ind(), _feat_space->phi_selected()[index]->rung(), _feat_space->phi_selected()[index]->expr(), _feat_space->phi_selected()[index]->postfix_expr(), _feat_space->phi_selected()[index]->value(), _feat_space->phi_selected()[index]->test_value(), 
_feat_space->phi_selected()[index]->unit()); + } + _models.back().push_back(ModelRegressor(_prop_unit, _prop, _prop_test, min_nodes, _task_sizes_train, _task_sizes_test, _fix_intercept)); +} + +void SISSORegressor::output_models(std::vector<double>& residual) +{ + if(_mpi_comm->rank() == 0) + { + for(int rr = 0; rr < _n_models_store; ++rr) + { + _models.back()[rr].to_file("models/train_dim_" + std::to_string(_models.size()) + "_model_" + std::to_string(rr) + ".dat"); + if(_leave_out_inds.size() > 0) + _models.back()[rr].to_file("models/test_dim_" + std::to_string(_models.size()) + "_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds); + } + } + for(int rr = 0; rr < _n_residual; ++rr) + _models.back()[rr].copy_error(&residual[rr * _n_samp]); +} + void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) { int n_get_models = std::max(_n_residual, _n_models_store); @@ -90,8 +119,6 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) } n_interactions /= n_dim_fact; - util_funcs::iterate(inds, inds.size(), _mpi_comm->rank()); - if(inds.back() >= 0) { #pragma omp parallel shared(min_inds, min_errors, n_interactions, n_get_models, n_dim) firstprivate(inds, coefs) @@ -110,10 +137,9 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) int ii_prev = 0; #pragma omp for schedule(dynamic) - for(int ii = 0; ii < n_interactions; ii += _mpi_comm->size()) + for(int ii = _mpi_comm->rank(); ii < n_interactions; ii += _mpi_comm->size()) { util_funcs::iterate(inds, inds.size(), ii - ii_prev); - int start = 0; for(auto& sz : _task_sizes_train) { @@ -122,15 +148,13 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) start += sz; } double rmse = std::sqrt(ddot_(_n_samp, error.data(), 1, error.data(), 1) / _n_samp); - #pragma omp critical + if(rmse < min_errors_private[max_error_ind]) { - if(rmse < min_errors_private[max_error_ind]) - { - min_errors_private[max_error_ind] = rmse; - std::copy_n(inds.begin(), 
inds.size(), min_inds_private.begin() + max_error_ind * n_dim); - max_error_ind = std::max_element(min_errors_private.begin(), min_errors_private.end()) - min_errors_private.begin(); - } + min_errors_private[max_error_ind] = rmse; + std::copy_n(inds.begin(), inds.size(), min_inds_private.begin() + max_error_ind * n_dim); + max_error_ind = std::max_element(min_errors_private.begin(), min_errors_private.end()) - min_errors_private.begin(); } + ii_prev = ii; } @@ -157,23 +181,15 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) inds = util_funcs::argsort(all_min_error); - std::vector<model_node_ptr> min_nodes(n_dim); - std::vector<ModelRegressor> models; - - // #pragma omp parallel for firstprivate(min_nodes) for(int rr = 0; rr < n_get_models; ++rr) { + std::vector<int> indexes(n_dim); node_value_arrs::clear_temp_test_reg(); for(int ii = 0; ii < n_dim; ++ii) - { - int index = all_min_inds[inds[rr] * n_dim + ii]; - min_nodes[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]->arr_ind(), _feat_space->phi_selected()[index]->rung(), _feat_space->phi_selected()[index]->expr(), _feat_space->phi_selected()[index]->postfix_expr(), _feat_space->phi_selected()[index]->value(), _feat_space->phi_selected()[index]->test_value(), _feat_space->phi_selected()[index]->unit()); - } - // #pragma omp critical - models.push_back(ModelRegressor(_prop_unit, _prop, _prop_test, min_nodes, _task_sizes_train, _task_sizes_test, _fix_intercept)); - } + indexes[ii] = all_min_inds[inds[rr] * n_dim + ii]; - _models.push_back(models); + add_model(indexes); + } } @@ -198,18 +214,9 @@ void SISSORegressor::fit() _mpi_comm->barrier(); duration = duration = omp_get_wtime() - start; + if(_mpi_comm->rank() == 0) - { std::cout << "Time for l0-norm: " << duration << std::endl; - - for(int rr = 0; rr < _n_models_store; ++rr) - { - _models.back()[rr].to_file("models/train_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat"); - if(_leave_out_inds.size() > 0) 
- _models.back()[rr].to_file("models/test_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds); - } - } - for(int rr = 0; rr < _n_residual; ++rr) - _models.back()[rr].copy_error(&residual[rr * _n_samp]); + output_models(residual); } } diff --git a/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp b/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp index e5a31a74..1bb1c860 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp +++ b/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp @@ -26,10 +26,12 @@ */ class SISSORegressor: public SISSO_DI { -protected: +private: std::vector<std::vector<ModelRegressor>> _models; //!< List of models +protected: int _lwork; //!< size of the work array (for dgels_) + public: /** * @brief Constructor for the Regressor @@ -56,7 +58,7 @@ public: * @param start The index in the property and feature vectors start copying into _b and _a * @param n_samp number of samples to perform least squares optimization on */ - void set_error(std::vector<int>& inds, double* coefs, int start, int n_samp, double* error); + virtual void set_error(std::vector<int>& inds, double* coefs, int start, int n_samp, double* error); /** * @brief Get the optimal size of the working array @@ -83,7 +85,11 @@ public: * @param start The index in the property and feature vectors start copying into _b and _a * @param n_samp number of samples to perform least squares optimization on */ - void set_a(std::vector<int>& inds, int start, int n_samp, double* a); + virtual void set_a(std::vector<int>& inds, int start, int n_samp, double* a); + + virtual void add_model(std::vector<int> indexes); + + virtual void output_models(std::vector<double>& residual); /** * @brief Preform the l0 normalization for a property or the residual @@ -91,7 +97,7 @@ public: * @param prop The property to fit * @param n_dim the dimensionality of the model */ - void l0_norm(std::vector<double>& prop, int n_dim); + virtual void 
l0_norm(std::vector<double>& prop, int n_dim); // DocString: sisso_reg_fit /** @@ -171,7 +177,7 @@ public: * @brief The selected models (cpp definition in <python/descriptor_identifier/SISSO_DI.cpp) * @return models as a python list */ - py::list models_py(); + virtual py::list models_py(); #endif }; diff --git a/src/feature_creation/feature_space/FeatureSpace.cpp b/src/feature_creation/feature_space/FeatureSpace.cpp index 0b90a1fd..38e96cc9 100644 --- a/src/feature_creation/feature_space/FeatureSpace.cpp +++ b/src/feature_creation/feature_space/FeatureSpace.cpp @@ -89,6 +89,11 @@ void FeatureSpace::initialize_fs(std::vector<double> prop, std::string project_t _project = project_funcs::project_classify; _project_no_omp = project_funcs::project_classify_no_omp; } + else if(project_type.compare("log_regression") == 0) + { + _project = project_funcs::project_log_r2; + _project_no_omp = project_funcs::project_log_r2_no_omp; + } else throw std::logic_error("Wrong projection type passed to FeatureSpace constructor."); diff --git a/src/main.cpp b/src/main.cpp index 09a08c4e..b2572680 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,5 +1,6 @@ #include <inputs/InputParser.hpp> #include <descriptor_identifier/SISSO_DI/SISSORegressor.hpp> +#include <descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp> #include <descriptor_identifier/SISSO_DI/SISSOClassifier.hpp> #include <iostream> @@ -54,6 +55,24 @@ int main(int argc, char const *argv[]) } } } + if(IP._calc_type.compare("log_regression") == 0) + { + SISSOLogRegressor sisso(IP._feat_space, IP._prop_unit, IP._prop_train, IP._prop_test, IP._task_sizes_train, IP._task_sizes_test, IP._leave_out_inds, IP._n_dim, IP._n_residuals, IP._n_models_store, IP._fix_intercept); + sisso.fit(); + + if(mpi_setup::comm->rank() == 0) + { + for(int ii = 0; ii < sisso.models().size(); ++ii) + { + std::cout << "Train RMSE: " << sisso.models()[ii][0].rmse(); + if(IP._prop_test.size() > 0) + std::cout << "; Test RMSE: " << 
sisso.models()[ii][0].test_rmse() << std::endl; + else + std::cout << std::endl; + std::cout << sisso.models()[ii][0] << "\n" << std::endl; + } + } + } else if(IP._calc_type.compare("classification") == 0) { SISSOClassifier sisso(IP._feat_space, IP._prop_unit, IP._prop_train, IP._prop_test, IP._task_sizes_train, IP._task_sizes_test, IP._leave_out_inds, IP._n_dim, IP._n_residuals, IP._n_models_store); diff --git a/src/python/bindings_docstring_keyed.cpp b/src/python/bindings_docstring_keyed.cpp index c374b215..d11ff0b2 100644 --- a/src/python/bindings_docstring_keyed.cpp +++ b/src/python/bindings_docstring_keyed.cpp @@ -6,9 +6,11 @@ void sisso::register_all() { sisso::descriptor_identifier::registerModel(); sisso::descriptor_identifier::registerModelRegressor(); + sisso::descriptor_identifier::registerModelLogRegressor(); sisso::descriptor_identifier::registerModelClassifier(); sisso::descriptor_identifier::registerSISSO_DI(); sisso::descriptor_identifier::registerSISSORegressor(); + sisso::descriptor_identifier::registerSISSOLogRegressor(); sisso::descriptor_identifier::registerSISSOClassifier(); sisso::feature_creation::registerFeatureSpace(); sisso::feature_creation::registerUnit(); @@ -390,6 +392,16 @@ void sisso::descriptor_identifier::registerModelRegressor() ; } +void sisso::descriptor_identifier::registerModelLogRegressor() +{ + class_<ModelLogRegressor, bases<ModelRegressor>>("ModelLogRegressor", init<Unit, std::vector<double>, std::vector<double>, std::vector<model_node_ptr>, std::vector<int>, std::vector<int>, bool>()) + .def(init<std::string>()) + .def(init<std::string, std::string>()) + .def("__str__", &ModelRegressor::toString, "@DocString_model_reg_str@") + .def("__repr__", &ModelRegressor::toString, "@DocString_model_reg_str@") + ; +} + void sisso::descriptor_identifier::registerModelClassifier() { class_<ModelClassifier, bases<Model>>("ModelClassifier", init<Unit, std::vector<double>, std::vector<double>, std::vector<model_node_ptr>, 
std::vector<int>, std::vector<int>, bool>()) @@ -437,6 +449,15 @@ void sisso::descriptor_identifier::registerSISSORegressor() ; } +void sisso::descriptor_identifier::registerSISSOLogRegressor() +{ + class_<SISSOLogRegressor, bases<SISSORegressor>>("SISSOLogRegressor", init<std::shared_ptr<FeatureSpace>, Unit, np::ndarray, np::ndarray, py::list, py::list, py::list, int, int, int, optional<bool>>()) + .def("fit", &SISSOLogRegressor::fit, "@DocString_sisso_reg_fit@") + .def(init<std::shared_ptr<FeatureSpace>, Unit, py::list, py::list, py::list, py::list, py::list, int, int, int, optional<bool>>()) + .add_property("models", &SISSOLogRegressor::models_py, "@DocString_sisso_reg_models_py@") + ; +} + void sisso::descriptor_identifier::registerSISSOClassifier() { class_<SISSOClassifier, bases<SISSO_DI>>("SISSOClassifier", init<std::shared_ptr<FeatureSpace>, Unit, np::ndarray, np::ndarray, py::list, py::list, py::list, int, int, int>()) diff --git a/src/python/bindings_docstring_keyed.hpp b/src/python/bindings_docstring_keyed.hpp index c5297143..aab68b43 100644 --- a/src/python/bindings_docstring_keyed.hpp +++ b/src/python/bindings_docstring_keyed.hpp @@ -9,6 +9,7 @@ #include <descriptor_identifier/SISSO_DI/SISSO_DI.hpp> #include <descriptor_identifier/SISSO_DI/SISSORegressor.hpp> +#include <descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp> #include <descriptor_identifier/SISSO_DI/SISSOClassifier.hpp> #include <feature_creation/feature_space/FeatureSpace.hpp> #include <python/feature_creation/node_utils.hpp> @@ -219,6 +220,11 @@ namespace sisso */ static void registerModelRegressor(); + /** + * @brief Register the Model object for conversion to a python object + */ + static void registerModelLogRegressor(); + /** * @brief Register the Model object for conversion to a python object */ @@ -234,6 +240,10 @@ namespace sisso */ static void registerSISSORegressor(); + /** + * @brief Register the SISSORegressor object for conversion to a python object + */ + static void 
registerSISSOLogRegressor(); /** * @brief Register the SISSORegressor object for conversion to a python object diff --git a/src/python/descriptor_identifier/SISSOLogRegressor.cpp b/src/python/descriptor_identifier/SISSOLogRegressor.cpp new file mode 100644 index 00000000..c20a0351 --- /dev/null +++ b/src/python/descriptor_identifier/SISSOLogRegressor.cpp @@ -0,0 +1,57 @@ +#include <descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp> + +SISSOLogRegressor::SISSOLogRegressor( + std::shared_ptr<FeatureSpace> feat_space, + Unit prop_unit, + np::ndarray prop, + np::ndarray prop_test, + py::list task_sizes_train, + py::list task_sizes_test, + py::list leave_out_inds, + int n_dim, + int n_residual, + int n_models_store, + bool fix_intercept +) : + SISSORegressor(feat_space, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, fix_intercept), + _prop_no_log(_prop) +{ + if(*std::min_element(_prop.begin(), _prop.end()) <= 0.0) + throw std::logic_error("Requesting to preform a Log Regression on a property that has values <= 0.0"); + std::transform(_prop.begin(), _prop.end(), _prop.begin(), [](double p){return std::log(p);}); + std::cout << _prop[0] << '\t' << _prop_no_log[0] << std::endl; +} + +SISSOLogRegressor::SISSOLogRegressor( + std::shared_ptr<FeatureSpace> feat_space, + Unit prop_unit, + py::list prop, + py::list prop_test, + py::list task_sizes_train, + py::list task_sizes_test, + py::list leave_out_inds, + int n_dim, + int n_residual, + int n_models_store, + bool fix_intercept +) : + SISSORegressor(feat_space, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, fix_intercept), + _prop_no_log(_prop) +{ + if(*std::min_element(_prop.begin(), _prop.end()) <= 0.0) + throw std::logic_error("Requesting to preform a Log Regression on a property that has values <= 0.0"); + std::transform(_prop.begin(), _prop.end(), _prop.begin(), [](double p){return 
std::log(p);});
+}
+
+py::list SISSOLogRegressor::models_py()
+{
+    py::list model_list;
+    for(auto& mod_list : _models)
+    {
+        py::list lst;
+        for(auto& mod : mod_list)
+            lst.append<ModelLogRegressor>(mod);
+        model_list.append<py::list>(lst);
+    }
+    return model_list;
+}
diff --git a/src/utils/math_funcs.cpp b/src/utils/math_funcs.cpp
index 2e6cd319..2a510498 100644
--- a/src/utils/math_funcs.cpp
+++ b/src/utils/math_funcs.cpp
@@ -18,6 +18,12 @@ bool util_funcs::iterate(std::vector<int>& inds, int size, int incriment)
     return cont;
 }
 
+double util_funcs::log_r2(double* a, double* b, double* log_a, int size)
+{
+    std::transform(a, a + size, log_a, [](double aa){return std::log(aa);});
+    return std::pow((ddot_(size, log_a, 1, b, 1) - static_cast<double>(size) * mean(log_a, size) * mean(b, size)) / (static_cast<double>(size) * stand_dev(log_a, size) * stand_dev(b, size)), 2.0);
+}
+
 double util_funcs::r(double* a, double* b, std::vector<int>& sizes)
 {
     double result = 0.0;
@@ -44,6 +50,19 @@ double util_funcs::r2(double* a, double* b, std::vector<int>& sizes)
     return result / sizes.size();
 }
 
+double util_funcs::log_r2(double* a, double* b, double* log_a, std::vector<int>& sizes)
+{
+    double result = 0.0;
+    int pos = 0;
+    for(auto sz : sizes)
+    {
+        result += log_r2(a + pos, b + pos, log_a + pos, sz);
+        pos += sz;
+    }
+
+    return result / sizes.size();
+}
+
 double util_funcs::r(double* a, double* b, int* sz, int n_task)
 {
     double result = 0.0;
@@ -69,3 +88,17 @@ double util_funcs::r2(double* a, double* b, int* sz, int n_task)
 
     return result / static_cast<double>(n_task);
 }
+
+double util_funcs::log_r2(double* a, double* b, double* log_a, int* sz, int n_task)
+{
+    double result = 0.0;
+    int pos = 0;
+    for(int nt = 0; nt < n_task; ++nt)
+    {
+        result += log_r2(a + pos, b + pos, log_a + pos, sz[nt]);
+        pos += sz[nt];
+    }
+
+    return result / static_cast<double>(n_task);
+}
+
diff --git a/src/utils/math_funcs.hpp b/src/utils/math_funcs.hpp
index 5d613b94..51ba3fcc 100644
--- 
a/src/utils/math_funcs.hpp +++ b/src/utils/math_funcs.hpp @@ -189,6 +189,37 @@ namespace util_funcs */ double r2(double* a, double* b, int* sz, int n_tasks); + /** + * @brief Calculate the Coefficient of Determination between two vectors (For the log transformed problem) + * + * @param a the pointer to the head of the first vector + * @param b the pointer to the head of the second vector + * @param size the size of the vector + * @return The Coefficient of Determination + */ + double log_r2(double* a, double* b, double* log_a, int size); + + /** + * @brief Calculate the average Coefficient of Determination between two vectors (For the log transformed problem) + * + * @param a the pointer to the head of the first vector + * @param b the pointer to the head of the second vector + * @param sizes the sizes of the tasks to calculate the correlation on + * @return The average Coefficient of Determination + */ + double log_r2(double* a, double* b, double* log_a, std::vector<int>& sizes); + + /** + * @brief Calculate the average Coefficient of Determination between two vectors (For the log transformed problem) + * + * @param a the pointer to the head of the first vector + * @param b the pointer to the head of the second vector + * @param sz the start of vector that describes the sizes of the tasks to calculate the correlation on + * @param n_tasks number of tasks to average over + * @return The average Coefficient of Determination + */ + double log_r2(double* a, double* b, double* log_a, int* sz, int n_tasks); + /** * @brief Sort a vector and return the indexes of the unsorted array that corresponds to the sorted one * diff --git a/src/utils/project.cpp b/src/utils/project.cpp index 8c8a0510..fe05a8bf 100644 --- a/src/utils/project.cpp +++ b/src/utils/project.cpp @@ -44,6 +44,29 @@ void project_funcs::project_r2(double* prop, double* scores, std::vector<node_pt } } +void project_funcs::project_log_r2(double* prop, double* scores, std::vector<node_ptr>& phi, 
std::vector<int>& sizes, int n_prop) +{ + int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); + + #pragma omp parallel firstprivate(prop) + { + int start = omp_get_thread_num() * (phi.size() / omp_get_num_threads()) + std::min(omp_get_thread_num(), static_cast<int>(phi.size()) % omp_get_num_threads()); + int end = (omp_get_thread_num() + 1) * (phi.size() / omp_get_num_threads()) + std::min(omp_get_thread_num() + 1, static_cast<int>(phi.size()) % omp_get_num_threads()); + + std::vector<double> log_feat(n_samp, 0.0); + std::transform(phi.begin() + start, phi.begin() + end, scores + start, [&prop, &sizes, &log_feat](node_ptr feat){return -1 * util_funcs::log_r2(prop, feat->value_ptr(), log_feat.data(), sizes.data(), sizes.size());}); + + for(int pp = 1; pp < n_prop; ++pp) + { + prop += n_samp; + std::transform(phi.begin() + start, phi.begin() + end, scores + start, scores + start, [&prop, &sizes, &log_feat](node_ptr feat, double sc){return std::min(-1 * util_funcs::log_r2(prop, feat->value_ptr(), log_feat.data(), sizes.data(), sizes.size()), sc);}); + } + std::transform(scores + start, scores + end, scores + start, [](double score){return std::isnan(score) ? 0.0 : score;}); + + #pragma omp barrier + } +} + void project_funcs::project_classify(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& sizes, int n_prop) { int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); @@ -89,6 +112,22 @@ void project_funcs::project_r2_no_omp(double* prop, double* scores, std::vector< std::transform(scores, scores + phi.size(), scores, [](double score){return std::isnan(score) ? 
0.0 : score;}); } +void project_funcs::project_log_r2_no_omp(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& sizes, int n_prop) +{ + int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); + + std::vector<double> log_feat(n_samp, 0.0); + + std::transform(phi.begin(), phi.end(), scores, [&prop, &sizes, &log_feat](node_ptr feat){return -1 * util_funcs::log_r2(prop, feat->value_ptr(), log_feat.data(), sizes.data(), sizes.size());}); + + for(int pp = 1; pp < n_prop; ++pp) + { + prop += n_samp; + std::transform(phi.begin(), phi.end(), scores, scores, [&prop, &sizes, &log_feat](node_ptr feat, double sc){return std::min(-1 * util_funcs::log_r2(prop, feat->value_ptr(), log_feat.data(), sizes.data(), sizes.size()), sc);}); + } + std::transform(scores, scores + phi.size(), scores, [](double score){return std::isnan(score) ? 0.0 : score;}); +} + void project_funcs::project_classify_no_omp(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& sizes, int n_prop) { int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); diff --git a/src/utils/project.hpp b/src/utils/project.hpp index 09aed1ac..02e84070 100644 --- a/src/utils/project.hpp +++ b/src/utils/project.hpp @@ -38,6 +38,17 @@ namespace project_funcs */ void project_r2(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& size, int n_prop); + /** + * @brief Calculate the projection scores of a set of features to a vector via Coefficient of Determination (on the log transformed problem) + * + * @param prop The pointer to the property vector to calculate the Coefficient of Determination against + * @param scores The pointer of the score vector + * @param phi The set of features to calculate the Coefficient of Determination of + * @param size Vector of the size of all of the tasks + * @param n_prop The number of properties to calculate the Coefficient of Determination of and return the maximum of + */ + void project_log_r2(double* prop, double* 
scores, std::vector<node_ptr>& phi, std::vector<int>& size, int n_prop); + /** * @brief Calculate projection scores for classification * @details Create a 1D-convex hull for each class and calculate the projection score @@ -71,6 +82,17 @@ namespace project_funcs */ void project_r2_no_omp(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& size, int n_prop); + /** + * @brief Calculate the projection scores of a set of features to a vector via Coefficient of Determination (on the log transform problem) + * + * @param prop The pointer to the property vector to calculate the Coefficient of Determination against + * @param scores The pointer of the score vector + * @param phi The set of features to calculate the Coefficient of Determination of + * @param size Vector of the size of all of the tasks + * @param n_prop The number of properties to calculate the Coefficient of Determination of and return the maximum of + */ + void project_log_r2_no_omp(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& size, int n_prop); + /** * @brief Calculate projection scores for classification * @details Create a 1D-convex hull for each class and calculate the projection score -- GitLab