Skip to content
Snippets Groups Projects
classification.py 3.64 KiB
# Copyright 2021 Thomas A. R. Purcell
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Retrain the SVM decision boundaries of a classification model

Functions:

update_model_svm: Generate a new model with an updated SVM from scikit-learn
"""
from sklearn import svm
import numpy as np
from sissopp import ModelClassifier


def update_model_svm(model, c=1.0, max_iter=-1, tol=0.0001, filename=None):
    """Generate a new model with an updated SVM from scikit-learn

    For every task the training values of the model's features are shifted and
    scaled by their per-feature minimum and range, a ``sklearn.svm.LinearSVC`` is
    fit to the scaled data, and the fitted coefficients are transformed back so
    that they apply to the unscaled feature values.

    Args:
        model (list of str, str, or ModelClassifier): The model to be updated.
            A str is passed directly to the ModelClassifier constructor; a list
            must contain exactly the train and test filenames, in that order
        c (float): The new c value to use
        max_iter (int): The maximum number of iterations to use. LinearSVC does
            not accept a negative iteration count, so a negative value (including
            the default of -1) or None selects scikit-learn's own default
        tol (float): Maximum allowable error
        filename (str): Filename to store the updated model

    Returns:
        ModelClassifier: The updated model with better SVM parameters

    Raises:
        ValueError: If model is a list that does not consist of exactly two strings
    """
    if isinstance(model, str):
        model = ModelClassifier(model)
    elif isinstance(model, list):
        if (
            len(model) != 2
            or not isinstance(model[0], str)
            or not isinstance(model[1], str)
        ):
            raise ValueError(
                "If model is a list it must only contain the train/test filenames in that order."
            )
        model = ModelClassifier(model[0], model[1])

    # Only forward max_iter when LinearSVC can accept it; otherwise let
    # scikit-learn fall back to its built-in default iteration limit.
    svc_kwargs = {"C": c, "tol": tol}
    if max_iter is not None and max_iter >= 0:
        svc_kwargs["max_iter"] = max_iter

    start_train = 0
    start_test = 0
    updated_coefs = []
    updated_prop_train_est = []
    updated_prop_test_est = []

    for ts_train, ts_test in zip(model.task_size_train, model.task_size_test):
        train_slice = slice(start_train, start_train + ts_train)
        X = np.column_stack([feat.value[train_slice] for feat in model.feats])

        # Per-feature shift (c0) and scale (a) computed from this task's training data.
        c0 = np.min(X, axis=0)
        feat_range = np.max(X, axis=0) - c0
        # A feature that is constant within the task has zero range; use 1.0 so
        # its scaled values are 0 instead of NaN from a division by zero.
        feat_range = np.where(feat_range == 0, 1.0, feat_range)
        a = 1.0 / feat_range

        X_scaled = a * (X - c0)
        lin_clf = svm.LinearSVC(**svc_kwargs)
        lin_clf.fit(X_scaled, model.prop_train[train_slice])

        # NOTE(review): with fix_intercept set, the classifier is still fit with
        # LinearSVC's default settings and only the stored intercept column starts
        # at zero; the back-transformation below still shifts it - confirm intent.
        if model.fix_intercept:
            intercept = np.zeros(len(lin_clf.coef_))
        else:
            intercept = lin_clf.intercept_
        coefs = np.column_stack((lin_clf.coef_, intercept))

        # Undo the scaling: w.(a*(x - c0)) + b == (a*w).x + (b - sum(c0*a*w)).
        # Looping over dimensions keeps the original accumulation order.
        for dd in range(model.n_dim):
            coefs[:, dd] = a[dd] * lin_clf.coef_[:, dd]
            coefs[:, -1] -= c0[dd] * coefs[:, dd]
        updated_coefs.append(coefs)

        updated_prop_train_est.append(lin_clf.predict(X_scaled))

        if ts_test > 0:
            test_slice = slice(start_test, start_test + ts_test)
            X_test = np.column_stack(
                [feat.test_value[test_slice] for feat in model.feats]
            )
            # The test data is scaled with the training-set shift and scale.
            updated_prop_test_est.append(lin_clf.predict(a * (X_test - c0)))
        else:
            updated_prop_test_est.append([])

        start_train += ts_train
        start_test += ts_test

    print("The updated coefficient for the decision boundaries:")
    print(updated_coefs)

    new_model = ModelClassifier(
        model,
        np.vstack(updated_coefs),
        np.concatenate(updated_prop_train_est),
        np.concatenate(updated_prop_test_est),
    )

    if filename:
        # NOTE(review): the meaning of the second argument (True) is defined by
        # ModelClassifier.to_file, which is not visible here - confirm.
        new_model.to_file(filename, True)

    return new_model