# Source code for canary.argument_pipeline.structure_prediction

"""The structure prediction module provides functionality with respect to the prediction
and structure of a document.
"""
import pandas
from scipy.sparse import hstack
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import LabelBinarizer, Normalizer

from ..argument_pipeline.base import Model
from ..nlp._utils import spacy_download
from ..nlp.transformers import WordSentimentCounter, DiscourseMatcher
from ..utils import logger

# Obtained once, at import time; called further down in this module to parse
# the covering sentences of each argument pair.
# NOTE(review): presumably a spaCy language pipeline -- confirm in ..nlp._utils.
nlp = spacy_download()

# Public names exported by this module.
__all__ = ["StructurePredictor", "StructureFeatures"]


class StructurePredictor(Model):
    """Model wrapper whose default pipeline predicts relations between argument components.

    Parameters
    ----------
    model_id: str, default=None
        Identifier handed to the base ``Model``. Falls back to
        ``"structure_predictor"`` when not supplied.
    """

    def __init__(self, model_id=None):
        if model_id is None:
            model_id = "structure_predictor"

        super().__init__(model_id=model_id)

    @staticmethod
    def _logistic_loss_name():
        """Return the logistic-loss name understood by the installed scikit-learn.

        scikit-learn 1.1 renamed the ``"log"`` loss of ``SGDClassifier`` to
        ``"log_loss"`` and 1.3 removed the old spelling, so the name has to be
        chosen from the installed version.

        Returns
        -------
        str
            ``"log_loss"`` for scikit-learn >= 1.1, otherwise ``"log"``.
        """
        import re

        import sklearn

        match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
        if match is None:
            # Unrecognised version string: keep the historical spelling.
            return "log"

        version = (int(match.group(1)), int(match.group(2)))
        return "log_loss" if version >= (1, 1) else "log"

    @staticmethod
    def default_train():
        """Default training method which supplies the default training set.

        Loads the essay corpus for relation prediction, randomly over-samples
        every class except the majority one and splits the result 70/30 into
        train and test partitions (all seeded with ``random_state=0``).

        Returns
        -------
        tuple
            ``(train_data, test_data, train_targets, test_targets)`` where the
            data entries are lists of per-sample dictionaries and the target
            entries are plain lists.
        """
        # Imported lazily so these packages are only needed when the default
        # training data is actually requested.
        from canary.corpora import load_essay_corpus
        from imblearn.over_sampling import RandomOverSampler
        from sklearn.model_selection import train_test_split

        logger.debug("Resample")
        ros = RandomOverSampler(random_state=0, sampling_strategy='not majority')

        x, y = load_essay_corpus(purpose="relation_prediction")
        x, y = ros.fit_resample(pandas.DataFrame(x), pandas.DataFrame(y))

        train_data, test_data, train_targets, test_targets = train_test_split(
            x,
            y,
            train_size=0.7,
            shuffle=True,
            random_state=0,
        )

        return (
            list(train_data.to_dict("index").values()),
            list(test_data.to_dict("index").values()),
            # The targets frame has a single column, labelled 0.
            train_targets[0].tolist(),
            test_targets[0].tolist(),
        )

    @classmethod
    def train(cls, pipeline_model=None, train_data=None, test_data=None, train_targets=None, test_targets=None,
              save_on_finish=True, **kwargs):
        """Train the model, building the default pipeline when none is supplied.

        Parameters
        ----------
        pipeline_model: default=None
            The estimator/pipeline to train. When ``None`` a pipeline of
            ``StructureFeatures`` -> ``Normalizer`` -> ``SGDClassifier``
            (logistic loss, ``random_state=0``) is used.
        train_data, test_data, train_targets, test_targets: default=None
            Passed through unchanged to the base class ``train``.
        save_on_finish: bool, default=True
            Passed through unchanged to the base class ``train``.
        **kwargs
            Accepted for signature compatibility; not forwarded to the base class.

        Returns
        -------
        Whatever the base class ``train`` returns.
        """
        if pipeline_model is None:
            pipeline_model = make_pipeline(
                StructureFeatures(),
                Normalizer(),
                SGDClassifier(
                    random_state=0,
                    # "log" was removed in scikit-learn 1.3; pick the valid spelling.
                    loss=cls._logistic_loss_name(),
                ),
            )

        return super().train(
            pipeline_model=pipeline_model,
            train_data=train_data,
            test_data=test_data,
            train_targets=train_targets,
            test_targets=test_targets,
            save_on_finish=save_on_finish,
        )
class StructureFeatures(TransformerMixin, BaseEstimator):
    """A custom feature transformer used for extracting features relevant to structure prediction.

    Each sample is expected to be a dictionary describing a pair of argument
    components (``arg1_*`` / ``arg2_*`` keys). The transformer horizontally
    stacks:

    * vectorized dictionary features (positions, token counts, component
      counts, sentence similarity and negative-word flags),
    * the ``cover_features`` union applied to each covering sentence,
    * binarized ``arg1_type`` and ``arg2_type`` labels.
    """

    # Transformers applied to the covering sentence of each argument.
    # NOTE(review): the same instances are placed in both unions built in
    # __init__ -- fine if they are stateless; confirm in ..nlp.transformers.
    cover_features: list = [
        DiscourseMatcher("support"),
        DiscourseMatcher("conflict"),
        DiscourseMatcher('forward'),
        DiscourseMatcher('thesis'),
        WordSentimentCounter("neu"),
        WordSentimentCounter("pos"),
        WordSentimentCounter("neg")
    ]

    def __init__(self):
        self.__arg1_cover_features = make_union(*StructureFeatures.cover_features)
        self.__arg2_cover_features = make_union(*StructureFeatures.cover_features)
        self.__dictionary_features = DictVectorizer()
        self.__ohe_arg1 = LabelBinarizer()
        self.__ohe_arg2 = LabelBinarizer()

    def fit(self, x, y=None):
        """Fits self to data provided.

        Parameters
        ----------
        x: list
            The data on which the transformer is fitted.
        y: list, default=None
            Ignored. Providing will have no effect. Provided for compatibility reasons.

        Returns
        -------
        self
        """
        self.__dictionary_features.fit(self._prepare_dictionary_features(x))

        frame = pandas.DataFrame(x)
        self.__arg1_cover_features.fit(frame.arg1_covering_sentence.tolist())
        self.__arg2_cover_features.fit(frame.arg2_covering_sentence.tolist())
        self.__ohe_arg1.fit(frame.arg1_type.tolist())
        self.__ohe_arg2.fit(frame.arg2_type.tolist())

        return self

    def transform(self, x):
        """Transform data into features

        Parameters
        ----------
        x: list
            A list of datapoints which are to be transformed using the mixin

        Returns
        -------
        scipy sparse matrix
            The features of the inputted list, stacked column-wise

        See Also
        ---------
        scipy.sparse.hstack
        """
        # The vectorizer was fitted on the *prepared* records, so the same
        # preparation must run here. Vectorizing the raw records would leave the
        # derived columns (sentence similarity, negative-word flags) at zero.
        dictionary_features = self.__dictionary_features.transform(
            self._prepare_dictionary_features(x)
        )

        frame = pandas.DataFrame(x)
        arg1_cover_features = self.__arg1_cover_features.transform(frame.arg1_covering_sentence)
        arg2_cover_features = self.__arg2_cover_features.transform(frame.arg2_covering_sentence)
        arg1_types = self.__ohe_arg1.transform(frame.arg1_type)
        arg2_types = self.__ohe_arg2.transform(frame.arg2_type)

        return hstack(
            [
                dictionary_features,
                arg1_cover_features,
                arg2_cover_features,
                arg1_types,
                arg2_types
            ]
        )

    @staticmethod
    def _binary_neg_present(sen: str):
        """Detects if negative words are present

        Parameters
        ----------
        sen: str
            A sentence of natural language

        Returns
        -------
        bool
            A boolean indicating if negative words are present
        """
        return WordSentimentCounter("neg").transform([sen])[0][0] > 0

    @staticmethod
    def _prepare_dictionary_features(data):
        """Builds the per-sample dictionaries that are fed to the DictVectorizer.

        Parameters
        ----------
        data: list
            A sequence of sample dictionaries. Each must provide
            ``arg1_covering_sentence`` and ``arg2_covering_sentence``; the
            remaining keys are read with ``dict.get`` and may be absent.

        Returns
        -------
        list
            A new list with one feature dictionary per sample. The input
            sequence and its dictionaries are left untouched.
        """

        def build(sample):
            # Parse both covering sentences once; the parsed objects provide the
            # similarity score and the text used for the negative-word check.
            sent1 = nlp(sample.get("arg1_covering_sentence"))
            sent2 = nlp(sample.get("arg2_covering_sentence"))

            return {
                "arg1_position": sample.get("arg1_position"),
                "arg2_position": sample.get("arg2_position"),
                'arg1_preceding_tokens': sample.get('arg1_preceding_tokens'),
                "arg1_following_tokens": sample.get("arg1_following_tokens"),
                'arg2_preceding_tokens': sample.get('arg2_preceding_tokens'),
                "arg2_following_tokens": sample.get("arg2_following_tokens"),
                "sentence_similarity_norm": sent1.similarity(sent2),
                "n_preceding_components": sample.get("n_preceding_components"),
                "n_following_components": sample.get("n_following_components"),
                "neg_present_arg1": StructureFeatures._binary_neg_present(sent1.text),
                "neg_present_arg2": StructureFeatures._binary_neg_present(sent2.text),
            }

        return [build(sample) for sample in data]