import os
import sys
from abc import ABCMeta, abstractmethod
from datetime import datetime
from pathlib import Path
from typing import Union
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from .. import __version__
from ..utils import CANARY_MODEL_STORAGE_LOCATION, logger, get_is_dev
# Public API of this module: only the abstract base class is exported.
__all__ = ["Model"]
class Model(metaclass=ABCMeta):
    """Abstract class that other Canary models descend from.

    Wraps an estimator exposing the scikit-learn API (``fit``, ``predict`` and,
    optionally, ``predict_proba`` / ``classes_``) and adds training, persistence
    and label-aware probability predictions on top of it.
    """

    @abstractmethod
    def __init__(self, model_id=None):
        """Constructor method

        Parameters
        ----------
        model_id: str
            The model ID. The model ID must never be None.

        Raises
        ------
        ValueError
            If ``model_id`` is None.
        """
        # The model ID is used as part of the filename when saving, so reject
        # a missing ID before any state is set up or directories are created.
        if model_id is None:
            raise ValueError("Model ID cannot be none")

        self.__model_id = model_id
        self._model = None
        self._metrics = {}
        self._metadata = {}
        self.__model_dir = CANARY_MODEL_STORAGE_LOCATION

        # Try and make relevant directories
        os.makedirs(self.__model_dir, exist_ok=True)

    def __repr__(self):
        # __repr__ must return a str; coerce in case a non-string ID was supplied.
        return str(self.model_id)

    @property
    def model_id(self):
        """Returns the model id

        Returns
        -------
        str
            The model id
        """
        return self.__model_id

    @property
    def supports_probability(self):
        """Returns a boolean if the model supports probability prediction.

        Returns
        -------
        bool
            True if the underlying model exposes ``predict_proba``, otherwise False.
        """
        return hasattr(self._model, 'predict_proba')

    def fit(self, training_data: list, training_labels: list):
        """Fits a model to the training data.

        Parameters
        ----------
        training_data: list
            The training data on which the model is fitted
        training_labels: list
            The training labels on which the data is fitted to

        Returns
        -------
        self
        """
        self._model.fit(training_data, training_labels)
        return self

    @property
    def metrics(self):
        """Property which returns model metrics

        Returns
        -------
        dict
            Returns the metrics of the model as a dict

        Examples
        --------
        >>> self.metrics
        {"f1score": 54.6, ...}
        """
        return self._metrics

    def set_model(self, model):
        """Set the scikit-learn model that sits under self._model

        Parameters
        ----------
        model
            a model that conforms to the standard scikit-learn API
        """
        self._model = model

    def save(self, save_to: Path = None):
        """Saves the model to disk after training

        The instance is written with joblib as ``<model_id>.joblib``. Metadata
        about the training environment is refreshed on every call.

        Parameters
        ----------
        save_to: Path or str, optional
            Directory in which to save the model. When None, the default
            model storage directory is used.
        """
        self._metadata = {
            "canary_version_trained_with": __version__,
            "python_version_trained_with": tuple(sys.version_info),
            "trained_on": datetime.now()
        }

        if save_to is None:
            logger.info(f"Saving {self.model_id}")
            target_dir = Path(self.__model_dir)
        else:
            logger.info(f"Saving {self.model_id} to {save_to}.")
            target_dir = Path(save_to)

        joblib.dump(self, target_dir / f"{self.model_id}.joblib", compress=2)

    @classmethod
    def train(cls, pipeline_model=None, train_data=None, test_data=None, train_targets=None, test_targets=None,
              save_on_finish=True, *args, **kwargs):
        """Classmethod which initialises a model and trains it on the provided training data.

        If any of the training / test inputs is missing and the model defines a
        ``default_train`` method, that method is used to obtain them.

        Parameters
        ----------
        pipeline_model
            The model which is trained to make predictions. Defaults to
            ``LogisticRegression(random_state=0)`` when None.
        train_data: list
            Training data
        test_data: list
            Test data
        train_targets: list
            The training labels
        test_targets: list
            The test labels
        save_on_finish: bool
            Should the model be saved when training has finished?
        *args: tuple
            Additional positional arguments
        **kwargs: dict
            Additional keyed-arguments

        Returns
        -------
        Model
            The model instance

        Raises
        ------
        ValueError
            If training / test data is missing and no usable default is available.
        """
        model = cls()

        # We need all of the below items to continue
        if any(item is None for item in (train_data, test_data, train_targets, test_targets)):
            # Check if we have a default training method
            if not hasattr(model, "default_train"):
                raise ValueError(
                    "Missing required training / test data in method call. "
                    "There is no default training method for this model."
                    " Please supply these and try again.")

            logger.debug("Using default training method")
            train_data, test_data, train_targets, test_targets = model.default_train()

            # Guard against unbounded recursion: re-entering train() with a
            # None entry would take this same branch again, forever.
            if any(item is None for item in (train_data, test_data, train_targets, test_targets)):
                raise ValueError("The default training method did not provide all training / test data.")

            # Re-dispatch through train() so that subclass overrides still apply.
            return model.train(pipeline_model=pipeline_model, train_data=train_data, test_data=test_data,
                               train_targets=train_targets, test_targets=test_targets,
                               save_on_finish=save_on_finish,
                               *args,
                               **kwargs)

        logger.debug(f"Training of {model.__class__.__name__} has begun")

        if pipeline_model is None:
            pipeline_model = LogisticRegression(random_state=0)
            logger.warning("No model selected. Defaulting to Logistic Regression.")

        model.set_model(pipeline_model)
        model.fit(train_data, train_targets)
        prediction = model.predict(test_data)

        # Build the dict-form report once; it feeds both the dev log and the metrics.
        report = classification_report(test_targets, prediction, output_dict=True)

        if get_is_dev() is True:
            from ._utils import log_training_data
            log_training_data({"result": report,
                               "datetime": str(datetime.now()), "model": model.__class__.__name__})

        logger.debug(f"\nModel stats:\n{classification_report(test_targets, prediction)}")
        model._metrics = report

        if save_on_finish is True:
            model.save()
        return model

    def _probability_predict(self, items: list) -> list[dict]:
        """Return one ``{class_label: probability}`` dict per item.

        The default 'predict_proba' returns positional floats which requires that you know
        the ordering of the classes. This returns the labels along with the float value.
        """
        labels = self._model.classes_
        return [dict(zip(labels, self._model.predict_proba([item])[0])) for item in items]

    def predict(self, data, probability=False) -> Union[list, bool]:
        """Make a prediction on some data. A wrapper around scikit-learn's predict method.

        Parameters
        ----------
        data:
            The data the predictor will be ran on. A list is treated as a batch
            of items (each predicted individually); anything else is treated as
            a single item.
        probability: bool
            boolean indicating if the method should return a probability prediction.
            Any truthy value is treated as True.

        Notes
        ------
        Not all models support probability predictions. This can be checked with the supports_probability property.
        When probability is requested but unsupported, a warning is logged and a plain prediction is returned.

        Returns
        -------
        Union[list, bool]
            a single prediction, or a list of predictions when ``data`` is a list.
            With probability enabled each prediction is a dict mapping class label to probability.

        Raises
        ------
        ValueError
            If no underlying model has been set.
        """
        if self._model is None:
            raise ValueError(
                "Cannot make a prediction because no model has been loaded."
                " Either train a model or download the pretrained models.")

        # Normalise the flag so truthy non-bool values behave the same for
        # single items and lists (identity checks against True/False did not).
        probability = bool(probability)
        if probability and not self.supports_probability:
            probability = False
            logger.warning(
                f"This model doesn't support probability. Probability has been set to {probability}.")

        is_batch = isinstance(data, list)
        items = data if is_batch else [data]

        if probability:
            results = self._probability_predict(items)
        else:
            # Each item is predicted on its own, wrapped as a one-element batch.
            results = [self._model.predict([item])[0] for item in items]

        return results if is_batch else results[0]