# Source code for canary.argument_pipeline.base

import os
import sys
from abc import ABCMeta, abstractmethod
from datetime import datetime
from pathlib import Path
from typing import Union

import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from .. import __version__
from ..utils import CANARY_MODEL_STORAGE_LOCATION, logger, get_is_dev

# Public API of this module: only the abstract ``Model`` base class is exported
# by ``from <module> import *``.
__all__ = [
    "Model"
]


class Model(metaclass=ABCMeta):
    """Abstract class that other Canary models descend from.

    A ``Model`` wraps an estimator that follows the scikit-learn API
    (``fit`` / ``predict`` and, optionally, ``predict_proba`` / ``classes_``)
    and adds training, evaluation-metric bookkeeping and persistence via joblib.
    """

    @abstractmethod
    def __init__(self, model_id=None):
        """Constructor method

        Parameters
        ----------
        model_id: str
            The model ID. It is used as part of the filename when the model is
            saved, so it must never be None.

        Raises
        ------
        ValueError
            If ``model_id`` is None.
        """

        # Model ID used as part of filename so it must be supplied.
        # Validate before touching any state or the filesystem.
        if model_id is None:
            raise ValueError("Model ID cannot be none")

        self.__model_id = model_id
        self._model = None
        self._metrics = {}
        self._metadata = {}
        self.__model_dir = CANARY_MODEL_STORAGE_LOCATION

        # Try and make relevant directories
        os.makedirs(self.__model_dir, exist_ok=True)

    def __repr__(self):
        # __repr__ must return a str; coerce in case a non-string ID was supplied.
        return str(self.model_id)

    @property
    def model_id(self):
        """Returns the model id

        Returns
        -------
        str
            The model id
        """

        return self.__model_id

    @property
    def supports_probability(self):
        """Indicates whether the underlying model supports probability prediction.

        Returns
        -------
        bool
            True if the wrapped model exposes a ``predict_proba`` attribute,
            False otherwise (including when no model has been set).
        """

        return hasattr(self._model, 'predict_proba')

    def fit(self, training_data: list, training_labels: list):
        """Fits a model to the training data.

        Parameters
        ----------
        training_data: list
            The training data on which the model is fitted
        training_labels: list
            The training labels on which the data is fitted to

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If no underlying model has been set (see ``set_model``).
        """

        # Fail with a clear message instead of an AttributeError on None.
        if self._model is None:
            raise ValueError(
                "Cannot fit because no model has been set. Call set_model() first.")

        self._model.fit(training_data, training_labels)
        return self

    @property
    def metrics(self):
        """Property which returns model metrics

        Returns
        -------
        dict
            Returns the metrics of the model as a dict

        Examples
        --------
        >>> model.metrics  # doctest: +SKIP
        {"f1score": 54.6, ...}
        """
        return self._metrics

    def set_model(self, model):
        """Set the scikit-learn model that sits under self._model

        Parameters
        ----------
        model
            a model that conforms to the standard scikit-learn API
        """
        self._model = model

    def save(self, save_to: Union[str, Path, None] = None):
        """Saves the model to disk after training

        The instance is serialised with joblib to ``<directory>/<model_id>.joblib``.
        Before saving, ``self._metadata`` is replaced with the Canary version,
        the Python version and the current timestamp.

        Parameters
        ----------
        save_to: str or Path, optional
            Directory in which to save the model. When None, the model storage
            directory recorded at construction time is used.
        """

        self._metadata = {
            "canary_version_trained_with": __version__,
            "python_version_trained_with": tuple(sys.version_info),
            "trained_on": datetime.now()
        }

        if save_to is None:
            logger.info(f"Saving {self.model_id}")
            target_dir = Path(self.__model_dir)
        else:
            logger.info(f"Saving {self.model_id} to {save_to}.")
            target_dir = Path(save_to)

        # Make sure the destination exists, mirroring what __init__ does for
        # the default storage directory.
        os.makedirs(target_dir, exist_ok=True)
        joblib.dump(self, target_dir / f"{self.model_id}.joblib", compress=2)

    @classmethod
    def train(cls, pipeline_model=None, train_data=None, test_data=None, train_targets=None, test_targets=None,
              save_on_finish=True, *args, **kwargs):
        """Classmethod which initialises a model and trains it on the provided training data.

        If any of the four data arguments is missing and the instance provides a
        ``default_train`` method, that method is called with no arguments and is
        expected to return ``(train_data, test_data, train_targets, test_targets)``.

        Parameters
        ----------
        pipeline_model
            The model which is trained to make predictions. Defaults to
            ``LogisticRegression(random_state=0)`` when None.
        train_data: list
            Training data
        test_data: list
            Test data
        train_targets: list
            The training labels
        test_targets: list
            The test labels
        save_on_finish: bool
            Should the model be saved when training has finished?
        *args: tuple
            Additional positional arguments. Accepted for backwards
            compatibility; not used by this implementation.
        **kwargs: dict
            Additional keyed-arguments. Accepted for backwards compatibility;
            not used by this implementation.

        Returns
        -------
        Model
            The model instance

        Raises
        ------
        ValueError
            If training / test data is missing and cannot be obtained from a
            ``default_train`` method.
        """

        model = cls()

        # We need all of the below items to continue
        if any(item is None for item in (train_data, test_data, train_targets, test_targets)):

            # Check if we have a default training method
            if not hasattr(model, "default_train"):
                raise ValueError(
                    "Missing required training / test data in method call. "
                    "There is no default training method for this model."
                    " Please supply these and try again.")

            logger.debug("Using default training method")
            train_data, test_data, train_targets, test_targets = model.default_train()

            # Continue in place rather than re-entering train(): re-entering
            # built a second instance, broke on extra positional arguments and
            # recursed forever if default_train() returned an incomplete split.
            if any(item is None for item in (train_data, test_data, train_targets, test_targets)):
                raise ValueError(
                    "The default training method did not return a complete set of "
                    "training / test data.")

        logger.debug(f"Training of {model.__class__.__name__} has begun")

        if pipeline_model is None:
            pipeline_model = LogisticRegression(random_state=0)
            # ``warning`` is the supported spelling; ``warn`` is a deprecated alias
            # in the standard logging API.
            logger.warning("No model selected. Defaulting to Logistic Regression.")

        model.set_model(pipeline_model)
        model.fit(train_data, train_targets)

        prediction = model.predict(test_data)

        if get_is_dev() is True:
            from ._utils import log_training_data
            log_training_data({"result": classification_report(test_targets, prediction, output_dict=True),
                               "datetime": str(datetime.now()), "model": model.__class__.__name__})

        logger.debug(f"\nModel stats:\n{classification_report(test_targets, prediction)}")

        model._metrics = classification_report(test_targets, prediction, output_dict=True)

        if save_on_finish is True:
            model.save()

        return model

    def _predict_label(self, item):
        """Return the predicted label for a single item."""
        return self._model.predict([item])[0]

    def _predict_probabilities(self, item) -> dict:
        """Return a ``{class label: probability}`` mapping for a single item.

        ``predict_proba`` returns positional floats, which requires knowing the
        ordering of the classes; pairing them with ``classes_`` makes the result
        self-describing.
        """
        probabilities = self._model.predict_proba([item])[0]
        return dict(zip(self._model.classes_, probabilities))

    def predict(self, data, probability=False) -> Union[list, bool]:
        """Make a prediction on some data. A wrapper around scikit-learn's predict method.

        Parameters
        ----------
        data:
            The data the predictor will be ran on. A ``list`` is treated as a
            batch and predicted item by item; anything else is treated as a
            single item.
        probability: bool
            boolean indicating if the method should return a probability prediction.
            Any truthy value is treated as True.

        Notes
        ------
        Not all models support probability predictions. This can be checked with the supports_probability property.
        If probability is requested but unsupported, a warning is logged and
        plain label predictions are returned instead.

        Returns
        -------
        list or single prediction
            For a list input, a list with one entry per item; otherwise a single
            entry. Each entry is the predicted label, or a dict mapping each
            class label to its probability when ``probability`` is true.

        Raises
        ------
        ValueError
            If no underlying model has been loaded.
        """

        if self._model is None:
            raise ValueError(
                "Cannot make a prediction because no model has been loaded."
                " Either train a model or download the pretrained models.")

        # Normalise so truthy non-bool values (e.g. 1, numpy.bool_) behave like
        # True instead of silently falling through and returning None.
        probability = bool(probability)

        if probability and not self.supports_probability:
            probability = False
            logger.warning(
                f"This model doesn't support probability. Probability has been set to {probability}.")

        predict_one = self._predict_probabilities if probability else self._predict_label

        if isinstance(data, list):
            return [predict_one(item) for item in data]
        return predict_one(data)