From 4259aefb52eeda9cd740daa8b7f63d5ab0dfb076 Mon Sep 17 00:00:00 2001
From: Thomas Purcell <purcell@fhi-berlin.mpg.de>
Date: Tue, 1 Sep 2020 17:32:55 +0200
Subject: [PATCH] Add projection for classification

Projection score added for classification problems, now onto the di part
---
 src/convex_hull/utils.cpp                     | 95 +++++++++++++++++++
 src/convex_hull/utils.hpp                     | 41 ++++++++
 .../feature_space/FeatureSpace.cpp            | 13 ++-
 .../feature_space/FeatureSpace.hpp            | 14 ++-
 src/inputs/InputParser.cpp                    |  3 +-
 src/inputs/InputParser.hpp                    |  1 +
 src/python/__init__.py                        | 22 ++++-
 src/python/bindings_docstring_keyed.cpp       |  6 +-
 src/python/feature_creation/FeatureSpace.cpp  | 14 ++-
 src/utils/math_funcs.hpp                      | 26 ++++-
 src/utils/project.cpp                         | 18 +++-
 src/utils/project.hpp                         | 21 +++-
 12 files changed, 255 insertions(+), 19 deletions(-)
 create mode 100644 src/convex_hull/utils.cpp
 create mode 100644 src/convex_hull/utils.hpp

diff --git a/src/convex_hull/utils.cpp b/src/convex_hull/utils.cpp
new file mode 100644
index 00000000..6031682e
--- /dev/null
+++ b/src/convex_hull/utils.cpp
@@ -0,0 +1,95 @@
+#include <convex_hull/utils.hpp>
+std::vector<double> convex_hull::SORTED_VALUE;
+std::vector<int> convex_hull::SORTED_PROP_INDS;
+
+std::vector<std::vector<double>> convex_hull::CLS_MAX;
+std::vector<std::vector<double>> convex_hull::CLS_MIN;
+std::vector<std::vector<int>> convex_hull::CLS_SZ;
+
+std::vector<double> convex_hull::TASK_SCORES;
+
+int convex_hull::N_CLASS = 0;
+int convex_hull::N_TASK = 0;
+
+void convex_hull::initialize_projection(std::vector<int>& sizes, double* prop)
+{
+    int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0);
+
+    SORTED_PROP_INDS = std::vector<int>(n_samp);
+    std::iota(SORTED_PROP_INDS.begin(), SORTED_PROP_INDS.end(), 0);
+
+    SORTED_VALUE = std::vector<double>(SORTED_PROP_INDS.size(), 0.0);
+    CLS_SZ = std::vector<std::vector<int>>(sizes.size());
+
+    int start = 0;
+    for(int tt = 0; tt < sizes.size(); ++tt)
+    {
+        int start_original = start;
+        util_funcs::argsort(SORTED_PROP_INDS.data() + start, SORTED_PROP_INDS.data() + start + sizes[tt], prop + start);
+        for(int pp = start + 1; pp < start_original + sizes[tt]; ++pp)
+        {
+            if(prop[SORTED_PROP_INDS[pp]] != prop[SORTED_PROP_INDS[pp - 1]])
+            {
+                CLS_SZ[tt].push_back(pp - start);
+                start += pp - start;
+            }
+        }
+        CLS_SZ[tt].push_back(sizes[tt] + start_original - start);
+        start = start_original + sizes[tt];
+    }
+
+    CLS_MAX = std::vector<std::vector<double>>(CLS_SZ.size());
+    CLS_MIN = std::vector<std::vector<double>>(CLS_SZ.size());
+
+    N_TASK = sizes.size();
+    TASK_SCORES = std::vector<double>(N_TASK, 0.0);
+
+    for(int tt = 0; tt < N_TASK; ++tt)
+    {
+        CLS_MAX[tt] = std::vector<double>(CLS_SZ[tt].size(), 0);
+        CLS_MIN[tt] = std::vector<double>(CLS_SZ[tt].size(), 0);
+        N_CLASS = std::max(N_CLASS, static_cast<int>((CLS_SZ[tt].size())));
+    }
+}
+
+double convex_hull::overlap_1d(double* value, double width)
+{
+    std::fill_n(TASK_SCORES.data(), TASK_SCORES.size(), 0.0);
+    for(int ii = 0; ii < SORTED_PROP_INDS.size(); ++ii)
+        SORTED_VALUE[ii] = value[SORTED_PROP_INDS[ii]];
+
+    int start = 0;
+    for(int tt = 0; tt < N_TASK; ++tt)
+    {
+        for(int cc = 0; cc < N_CLASS; ++cc)
+        {
+            CLS_MAX[tt][cc] = *std::max_element(SORTED_VALUE.begin() + start, SORTED_VALUE.begin() + start + CLS_SZ[tt][cc]) + width;
+            CLS_MIN[tt][cc] = *std::min_element(SORTED_VALUE.begin() + start, SORTED_VALUE.begin() + start + CLS_SZ[tt][cc]) - width;
+            start += CLS_SZ[tt][cc];
+        }
+    }
+
+    start = 0;
+    for(int tt = 0; tt < N_TASK; ++tt)
+    {
+        double cls_norm = 2.0 / (static_cast<double>(CLS_SZ[tt].size()) * static_cast<double>(CLS_SZ[tt].size() - 1));
+        for(int c1 = 0; c1 < CLS_SZ[tt].size(); ++c1)
+        {
+            start = 0;
+            double min = CLS_MIN[tt][c1];
+            double max = CLS_MAX[tt][c1];
+            for(int c2 = 0; c2 < CLS_SZ[tt].size(); ++c2)
+            {
+                if(c1 == c2)
+                {
+                    start += CLS_SZ[tt][c2];
+                    continue;
+                }
+                TASK_SCORES[tt] += std::count_if(SORTED_VALUE.begin() + start, SORTED_VALUE.begin() + start + CLS_SZ[tt][c2], [min, max](double val){return (val >= min) && (val <= max);});
+                TASK_SCORES[tt] += cls_norm * (1.0 - std::min(CLS_MAX[tt][c1] - CLS_MIN[tt][c2], CLS_MAX[tt][c2] - CLS_MIN[tt][c1]) / std::max(CLS_MAX[tt][c1] - CLS_MIN[tt][c2], CLS_MAX[tt][c2] - CLS_MIN[tt][c1]));
+                start += CLS_SZ[tt][c2];
+            }
+        }
+    }
+    return util_funcs::mean(TASK_SCORES);
+}
\ No newline at end of file
diff --git a/src/convex_hull/utils.hpp b/src/convex_hull/utils.hpp
new file mode 100644
index 00000000..01beea50
--- /dev/null
+++ b/src/convex_hull/utils.hpp
@@ -0,0 +1,41 @@
+/** @file convex_hull/utils.hpp
+ *  @brief A set of functions to create convex hulls, and compare overlap/distance
+ *
+ *  @author Thomas A. R. Purcell (tpurcell)
+ *  @bug No known bugs.
+ */
+
+#ifndef CONVEX_UTILS
+#define CONVEX_UTILS
+
+#include <utils/math_funcs.hpp>
+#include <iomanip>
+namespace convex_hull
+{
+    extern std::vector<double> SORTED_VALUE; //!< The value sorted based on the property
+    extern std::vector<int> SORTED_PROP_INDS; //!< The value sorted based on the property
+
+    extern std::vector<std::vector<double>> CLS_MAX; //!< The maximum value of the property in each class
+    extern std::vector<std::vector<double>> CLS_MIN; //!< The minimum value of the property in each class
+    extern std::vector<std::vector<int>> CLS_SZ; //!< The number of values in each class
+
+    extern std::vector<double> TASK_SCORES; //!< The scores for each task
+
+    extern int N_CLASS; //!< Number of classes
+    extern int N_TASK; //!< Number of tasks
+
+    void initialize_projection(std::vector<int>& sizes, double* prop);
+
+    /**
+     * @brief Calculate the projection scores of a set of features to a vector via Pearson correlation
+     *
+     * @param prop The pointer to the property vector to calculate the Pearson correlation against
+     * @param value The pointer to the value of the data
+     * @param inds Vector storing the indexes of the sorted porperty value
+     * @param class_szs number of elements in each class
+     * @returns The projection score for the particular feature
+     */
+    double overlap_1d(double* value, double width = 1e-10);
+}
+
+#endif
\ No newline at end of file
diff --git a/src/feature_creation/feature_space/FeatureSpace.cpp b/src/feature_creation/feature_space/FeatureSpace.cpp
index 79488437..1755edb1 100644
--- a/src/feature_creation/feature_space/FeatureSpace.cpp
+++ b/src/feature_creation/feature_space/FeatureSpace.cpp
@@ -25,6 +25,7 @@ FeatureSpace::FeatureSpace(
     std::vector<std::string> allowed_ops,
     std::vector<double> prop,
     std::vector<int> task_sizes,
+    std::string project_type,
     int max_phi,
     int n_sis_select,
     int max_store_rung,
@@ -53,10 +54,10 @@ FeatureSpace::FeatureSpace(
     _n_rung_generate(n_rung_generate)
 
 {
-    initialize_fs(prop);
+    initialize_fs(prop, project_type);
 }
 
-void FeatureSpace::initialize_fs(std::vector<double> prop)
+void FeatureSpace::initialize_fs(std::vector<double> prop, std::string project_type)
 {
     if(_n_rung_store == -1)
         _n_rung_store = _max_phi - 1;
@@ -76,7 +77,13 @@ void FeatureSpace::initialize_fs(std::vector<double> prop)
         out_file_stream.close();
         sum_file_stream.close();
     }
-    _project = project_funcs::project_r;
+
+    if(project_type.compare("regression") == 0)
+        _project = project_funcs::project_r2;
+    else if(project_type.compare("classification") == 0)
+        _project = project_funcs::project_classify;
+    else
+        throw std::logic_error("Wrong projection type passed to FeatureSpace constructor.");
 
     for(auto & op : _allowed_ops)
     {
diff --git a/src/feature_creation/feature_space/FeatureSpace.hpp b/src/feature_creation/feature_space/FeatureSpace.hpp
index a2586d65..3cb7cc66 100644
--- a/src/feature_creation/feature_space/FeatureSpace.hpp
+++ b/src/feature_creation/feature_space/FeatureSpace.hpp
@@ -76,6 +76,8 @@ public:
      * @param phi_0 The initial set of features to combine
      * @param allowed_ops list of allowed operators
      * @param prop The property to be learned (training data)
+     * @param task_sizes The number of samples per task
+     * @param project_type The projection operator to use
      * @param max_phi highest rung value for the calculation
      * @param n_sis_select number of features to select during each SIS step
      * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
@@ -90,6 +92,7 @@ public:
         std::vector<std::string> allowed_ops,
         std::vector<double> prop,
         std::vector<int> task_sizes,
+        std::string project_type="pearson",
         int max_phi=1,
         int n_sis_select=1,
         int max_store_rung=-1,
@@ -104,7 +107,7 @@ public:
      *
      * @param prop The property trying to be learned
      */
-    void initialize_fs(std::vector<double> prop);
+    void initialize_fs(std::vector<double> prop, std::string project_type);
 
     /**
      * @brief Generate the full feature set from the allowed operators and initial feature set
@@ -264,6 +267,8 @@ public:
          * @param phi_0 The initial set of features to combine
          * @param allowed_ops list of allowed operators
          * @param prop The property to be learned (training data)
+         * @param task_sizes The number of samples per task
+         * @param project_type The projection operator to use
          * @param max_phi highest rung value for the calculation
          * @param n_sis_select number of features to select during each SIS step
          * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
@@ -277,6 +282,7 @@ public:
             py::list allowed_ops,
             py::list prop,
             py::list task_sizes,
+            std::string project_type="pearson",
             int max_phi=1,
             int n_sis_select=1,
             int max_store_rung=-1,
@@ -293,6 +299,8 @@ public:
          * @param phi_0 The initial set of features to combine
          * @param allowed_ops list of allowed operators
          * @param prop The property to be learned (training data)
+         * @param task_sizes The number of samples per task
+         * @param project_type The projection operator to use
          * @param max_phi highest rung value for the calculation
          * @param n_sis_select number of features to select during each SIS step
          * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
@@ -306,6 +314,7 @@ public:
             py::list allowed_ops,
             np::ndarray prop,
             py::list task_sizes,
+            std::string project_type="pearson",
             int max_phi=1,
             int n_sis_select=1,
             int max_store_rung=-1,
@@ -321,6 +330,8 @@ public:
          *
          * @param feature_file The file with the postfix expressions for the feature space
          * @param phi_0 The initial set of features to combine
+         * @param task_sizes The number of samples per task
+         * @param project_type The projection operator to use
          * @param n_sis_select number of features to select during each SIS step
          * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
          * @param cross_corr_max Maximum cross-correlation used for selecting features
@@ -329,6 +340,7 @@ public:
             std::string feature_file,
             py::list phi_0,
             py::list task_sizes,
+            std::string project_type="pearson",
             int n_sis_select=1,
             double cross_corr_max=1.0
         );
diff --git a/src/inputs/InputParser.cpp b/src/inputs/InputParser.cpp
index a76fb7d8..f04da374 100644
--- a/src/inputs/InputParser.cpp
+++ b/src/inputs/InputParser.cpp
@@ -6,6 +6,7 @@ InputParser::InputParser(boost::property_tree::ptree IP, std::string fn, std::sh
     _data_file(IP.get<std::string>("data_file", "data.csv")),
     _prop_key(IP.get<std::string>("property_key", "prop")),
     _task_key(IP.get<std::string>("task_key", "Task")),
+    _calc_type(IP.get<std::string>("calc_type", "regression")),
     _leave_out_inds(as_vector<int>(IP, "leave_out_inds")),
     _cross_cor_max(IP.get<double>("max_feat_cross_correlation", 1.0)),
     _l_bound(IP.get<double>("min_abs_feat_val", 1e-50)),
@@ -273,7 +274,7 @@ void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm, st
     for(int ff = 0; ff < headers.size(); ++ff)
         phi_0.push_back(std::make_shared<FeatureNode>(ff, headers[ff], data[ff], test_data[ff], units[ff]));
 
-    _feat_space = std::make_shared<FeatureSpace>(comm, phi_0, _opset, _prop_train, _task_sizes_train, _max_rung, _n_sis_select, _max_store_rung, _n_rung_generate, _cross_cor_max, _l_bound, _u_bound);
+    _feat_space = std::make_shared<FeatureSpace>(comm, phi_0, _opset, _prop_train, _task_sizes_train, _calc_type, _max_rung, _n_sis_select, _max_store_rung, _n_rung_generate, _cross_cor_max, _l_bound, _u_bound);
 }
 
 void stripComments(std::string& filename)
diff --git a/src/inputs/InputParser.hpp b/src/inputs/InputParser.hpp
index c6da6b7d..fd1834d3 100644
--- a/src/inputs/InputParser.hpp
+++ b/src/inputs/InputParser.hpp
@@ -48,6 +48,7 @@ public:
     std::string _data_file; //!< Name of the data file
     std::string _prop_key; //!< Key used to find the property column in the data file
     std::string _task_key; //!< Key used to find the task column in the data file
+    std::string _calc_type; //!< Type of projection operator to use
 
     std::shared_ptr<FeatureSpace> _feat_space; //!< shared_ptr to the FeatureSpace generated from the data file and the input file
 
diff --git a/src/python/__init__.py b/src/python/__init__.py
index 0827a2fe..469b8657 100644
--- a/src/python/__init__.py
+++ b/src/python/__init__.py
@@ -77,6 +77,7 @@ def generate_phi_0_from_csv(
     # Load csv file
     if isinstance(df, str):
         df = pd.read_csv(df, index_col=0)
+    df = df.astype(float)
     df_cols = df.columns.to_list()
     col_exprs = [c.split(" (")[0] for c in df_cols]
     prop_ind = np.where([c_ex == prop_key.split(" (")[0] for c_ex in col_exprs])[0][0]
@@ -191,7 +192,9 @@ def generate_phi_0_from_csv(
     )
 
 
-def generate_fs(phi_0, prop, task_sizes_train, allowed_ops, max_phi, n_sis_select):
+def generate_fs(
+    phi_0, prop, task_sizes_train, allowed_ops, calc_type, max_phi, n_sis_select
+):
     """Generate a FeatureSet for the calculation
 
     Args:
@@ -200,9 +203,8 @@ def generate_fs(phi_0, prop, task_sizes_train, allowed_ops, max_phi, n_sis_selec
         task_sizes_train (list): The number of samples in the training data for each task
         allowed_ops (list): List of operations used to combine the features
         max_phi (int): Maximum rung for the calculation
+        calc_type (str): type of calculation regression or classification
         n_sis_select (int): number of features to select in each round of SIS
-        task_key (str): The key corresponding to which column in the csv file the task differentiation is stored in
-        leave_out_frac (list): List of indices to pull from the training data to act as a test set
 
     Returns:
         fs (FeatureSpace): The FeatureSpace for the calculation
@@ -230,7 +232,13 @@ def generate_fs(phi_0, prop, task_sizes_train, allowed_ops, max_phi, n_sis_selec
         ]
 
     return FeatureSpace(
-        phi_0, allowed_ops, list(prop), task_sizes_train, max_phi, n_sis_select
+        phi_0,
+        allowed_ops,
+        list(prop),
+        task_sizes_train,
+        calc_type,
+        max_phi,
+        n_sis_select,
     )
 
 
@@ -242,6 +250,7 @@ def generate_fs_sr_from_csv(
     max_phi,
     n_sis_select,
     max_dim,
+    calc_type="regression",
     n_residuals=1,
     n_model_store=1,
     task_key=None,
@@ -258,6 +267,7 @@ def generate_fs_sr_from_csv(
         max_phi (int): Maximum rung for the calculation
         n_sis_select (int): number of features to select in each round of SIS
         max_dim (int): Maximum dimension of the models to learn
+        calc_type (str): type of calculation regression or classification
         n_residuals (int): number of residuals to use for the next SIS step when learning higher dimensional models
         task_key (str): The key corresponding to which column in the csv file the task differentiation is stored in
         leave_out_frac (float): Fraction of samples to leave out
@@ -276,7 +286,9 @@ def generate_fs_sr_from_csv(
         leave_out_inds=leave_out_inds,
     )
 
-    fs = generate_fs(phi_0, prop, task_sizes_train, allowed_ops, max_phi, n_sis_select)
+    fs = generate_fs(
+        phi_0, prop, task_sizes_train, allowed_ops, calc_type, max_phi, n_sis_select
+    )
 
     sr = SISSORegressor(
         fs,
diff --git a/src/python/bindings_docstring_keyed.cpp b/src/python/bindings_docstring_keyed.cpp
index 4cd21828..49263ccf 100644
--- a/src/python/bindings_docstring_keyed.cpp
+++ b/src/python/bindings_docstring_keyed.cpp
@@ -40,9 +40,9 @@ void sisso::feature_creation::registerFeatureSpace()
     void (FeatureSpace::*sis_list)(list) = &FeatureSpace::sis;
     void (FeatureSpace::*sis_ndarray)(np::ndarray) = &FeatureSpace::sis;
 
-    class_<FeatureSpace>("FeatureSpace", init<list, list, np::ndarray, list, optional<int, int, int, int, double, double, double>>())
-        .def(init<list, list, list, list, optional<int, int, int, int, double, double, double>>())
-        .def(init<std::string, list, list, optional<int, double>>())
+    class_<FeatureSpace>("FeatureSpace", init<list, list, np::ndarray, list, optional<std::string, int, int, int, int, double, double, double>>())
+        .def(init<list, list, list, list, optional<std::string, int, int, int, int, double, double, double>>())
+        .def(init<std::string, list, list, optional<std::string, int, double>>())
         .def("sis", sis_list, "@DocString_feat_space_sis_list@")
         .def("sis", sis_ndarray, "@DocString_feat_space_sis_arr@")
         .def("feat_in_phi", &FeatureSpace::feat_in_phi, "@DocString_feat_space_feat_in_phi@")
diff --git a/src/python/feature_creation/FeatureSpace.cpp b/src/python/feature_creation/FeatureSpace.cpp
index 9f38bbfb..c7c24388 100644
--- a/src/python/feature_creation/FeatureSpace.cpp
+++ b/src/python/feature_creation/FeatureSpace.cpp
@@ -5,6 +5,7 @@ FeatureSpace::FeatureSpace(
     py::list allowed_ops,
     py::list prop,
     py::list task_sizes,
+    std::string project_type,
     int max_phi,
     int n_sis_select,
     int max_store_rung,
@@ -32,7 +33,7 @@ FeatureSpace::FeatureSpace(
     _n_rung_generate(n_rung_generate)
 {
     _n_samp = _phi_0[0]->n_samp();
-    initialize_fs(python_conv_utils::from_list<double>(prop));
+    initialize_fs(python_conv_utils::from_list<double>(prop), project_type);
 }
 
 FeatureSpace::FeatureSpace(
@@ -40,6 +41,7 @@ FeatureSpace::FeatureSpace(
     py::list allowed_ops,
     np::ndarray prop,
     py::list task_sizes,
+    std::string project_type,
     int max_phi,
     int n_sis_select,
     int max_store_rung,
@@ -67,13 +69,14 @@ FeatureSpace::FeatureSpace(
     _n_rung_generate(n_rung_generate)
 {
     _n_samp = _phi_0[0]->n_samp();
-    initialize_fs(python_conv_utils::from_ndarray<double>(prop));
+    initialize_fs(python_conv_utils::from_ndarray<double>(prop), project_type);
 }
 
 FeatureSpace::FeatureSpace(
     std::string feature_file,
     py::list phi_0,
     py::list task_sizes,
+    std::string project_type,
     int n_sis_select,
     double cross_corr_max
 ):
@@ -92,7 +95,12 @@ FeatureSpace::FeatureSpace(
     _n_rung_generate(0)
 {
     _n_samp = _phi_0[0]->n_samp();
-    _project = project_funcs::project_r;
+    if(project_type.compare("regression") == 0)
+        _project = project_funcs::project_r2;
+    else if(project_type.compare("classification") == 0)
+        _project = project_funcs::project_classify;
+    else
+        throw std::logic_error("Wrong projection type passed to FeatureSpace constructor.");
 
     std::vector<node_ptr> phi_temp = str2node::phi_from_file(feature_file, _phi_0);
     _n_feat = phi_temp.size();
diff --git a/src/utils/math_funcs.hpp b/src/utils/math_funcs.hpp
index 9ddbcf7f..5d613b94 100644
--- a/src/utils/math_funcs.hpp
+++ b/src/utils/math_funcs.hpp
@@ -207,9 +207,9 @@ namespace util_funcs
     /**
      * @brief Sort a vector and return the indexes of the unsorted array that corresponds to the sorted one
      *
-     * @param vec vector to sort
      * @param begin the starting point for the sorting
      * @param end the end point for the sorting
+     * @param vec vector to sort
      * @return The indexes of the sorted array
      */
     inline void argsort(int* begin, int* end, std::vector<double>& vec)
@@ -217,6 +217,18 @@ namespace util_funcs
         std::sort(begin, end, [&vec](int i1, int i2){return vec[i1] < vec[i2];});
     }
 
+    /**
+     * @brief Sort a vector and return the indexes of the unsorted array that corresponds to the sorted one
+     *
+     * @param begin the starting point for the sorting
+     * @param end the end point for the sorting
+     * @param vec_begin start pointer to the vector to sort
+     */
+    inline void argsort(int* begin, int* end, double* vec_begin)
+    {
+        std::sort(begin, end, [vec_begin](int i1, int i2){return vec_begin[i1] < vec_begin[i2];});
+    }
+
     /**
      * @brief Sort a vector and return the indexes of the unsorted array that corresponds to the sorted one
      *
@@ -245,6 +257,18 @@ namespace util_funcs
         std::sort(begin, end, [&vec](int i1, int i2){return vec[i1] < vec[i2];});
     }
 
+    /**
+     * @brief Sort a vector and return the indexes of the unsorted array that corresponds to the sorted one
+     *
+     * @param begin the starting point for the sorting
+     * @param end the end point for the sorting
+     * @param vec_begin start pointer to the vector to sort
+     */
+    inline void argsort(int* begin, int* end, int* vec_begin)
+    {
+        std::sort(begin, end, [vec_begin](int i1, int i2){return vec_begin[i1] < vec_begin[i2];});
+    }
+
     /**
      * @brief The maximum absolute value of the vector
      *
diff --git a/src/utils/project.cpp b/src/utils/project.cpp
index c2f0518d..615ad15c 100644
--- a/src/utils/project.cpp
+++ b/src/utils/project.cpp
@@ -25,4 +25,20 @@ void project_funcs::project_r2(double* prop, double* scores, std::vector<node_pt
         std::transform(phi.begin(), phi.end(), scores, scores, [&prop, &sizes](node_ptr feat, double sc){return std::min(-1 * util_funcs::r2(prop, feat->value_ptr(), sizes.data(), sizes.size()), sc);});
     }
     std::transform(scores, scores + phi.size(), scores, [](double score){return std::isnan(score) ? 0.0 : score;});
-}
\ No newline at end of file
+}
+
+void project_funcs::project_classify(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& sizes, int n_prop)
+{
+    int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0);
+
+    convex_hull::initialize_projection(sizes, prop);
+    std::transform(phi.begin(), phi.end(), scores, [](node_ptr feat){return convex_hull::overlap_1d(feat->value_ptr());});
+
+    for(int pp = 1; pp < n_prop; ++pp)
+    {
+        prop += n_samp;
+        convex_hull::initialize_projection(sizes, prop);
+        std::transform(phi.begin(), phi.end(), scores, [](node_ptr feat){return convex_hull::overlap_1d(feat->value_ptr());});
+    }
+    std::transform(scores, scores + phi.size(), scores, [](double score){return std::isnan(score) ? std::numeric_limits<double>::infinity() : score;});
+}
diff --git a/src/utils/project.hpp b/src/utils/project.hpp
index 44df323c..746b0a45 100644
--- a/src/utils/project.hpp
+++ b/src/utils/project.hpp
@@ -4,7 +4,13 @@
  *  @author Thomas A. R. Purcell (tpurcell)
  *  @bug No known bugs.
  */
+#ifndef UTILS_PROJECT
+#define UTILS_PROJECT
+
+#include <limits>
+
 #include <feature_creation/node/Node.hpp>
+#include <convex_hull/utils.hpp>
 
 namespace project_funcs
 {
@@ -29,4 +35,17 @@ namespace project_funcs
      * @param n_prop The number of properties to calculate the Coefficient of Determination of and return the maximum of
      */
     void project_r2(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& size, int n_prop);
-}
\ No newline at end of file
+
+    /**
+     * @brief Calculate projection scores for classification
+     * @details Create a 1D-convex hull for each class and calculate the projection score
+     *
+     * @param prop The classes to separate the property
+     * @param scores the vector to store the projection scores
+     * @param size list of the sizes for each task
+     * @param n_prop number of properties
+     */
+    void project_classify(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& size, int n_prop);
+}
+
+#endif
\ No newline at end of file
-- 
GitLab