import nltk
import pandas
from scipy.sparse import hstack
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import LabelBinarizer, MaxAbsScaler
from sklearn.svm import SVC
from ..argument_pipeline.base import Model
from ..nlp import PosDistribution
from ..nlp._utils import spacy_download, nltk_download
from ..nlp.transformers import DiscourseMatcher, SharedNouns
from ..utils import logger
# ``nltk_download`` and ``spacy_download`` are project helpers imported from
# ``..nlp._utils``; both calls run once, when this module is imported.
# The "punkt" resource is presumably what the ``nltk.word_tokenize`` calls in
# this module rely on.
nltk_download("punkt")

# Module-wide spaCy pipeline, shared by every ``LinkFeatures`` instance.
# The listed component names are passed as ``disable``.
_nlp = spacy_download(
    disable=[
        "ner",
        "textcat",
        "tagger",
        "lemmatizer",
        "tokenizer",
        "attribute_ruler",
        "benepar",
    ]
)

# Names exported by ``from <module> import *``.
__all__ = ["LinkPredictor", "LinkFeatures"]
class LinkPredictor(Model):
    """Prediction model which can predict if two argument components are "linked"."""

    def __init__(self, model_id=None):
        """Create the predictor.

        Parameters
        ----------
        model_id: str, optional
            Identifier handed to ``Model.__init__``. ``"link_predictor"`` is
            used when it is None.
        """
        if model_id is None:
            model_id = "link_predictor"
        super().__init__(model_id)

    @staticmethod
    def default_train():
        """Default training method

        Loads the essay corpus for link prediction, splits it 60/40 with a
        fixed seed and then randomly over-samples the training portion only.
        Over-sampling after the split keeps duplicated rows out of the test
        portion, so the test data stays independent of the training data.

        Returns
        --------
        Tuple
            training data, test data, training targets, test targets
        """
        from canary.corpora import load_essay_corpus
        from imblearn.over_sampling import RandomOverSampler
        from sklearn.model_selection import train_test_split

        x, y = load_essay_corpus(purpose='link_prediction')

        train_data, test_data, train_targets, test_targets = train_test_split(
            pandas.DataFrame(x),
            pandas.DataFrame(y),
            train_size=0.6,
            shuffle=True,
            random_state=0,
        )

        logger.debug("Resample")
        ros = RandomOverSampler(random_state=0, sampling_strategy='not majority')
        train_data, train_targets = ros.fit_resample(train_data, train_targets)

        # "records" yields one {column: value} dict per row and, unlike
        # "index", does not depend on the row labels being unique after
        # resampling.
        return (
            train_data.to_dict("records"),
            test_data.to_dict("records"),
            train_targets[0].tolist(),
            test_targets[0].tolist(),
        )

    @classmethod
    def train(cls, pipeline_model=None, train_data=None, test_data=None, train_targets=None, test_targets=None,
              save_on_finish=True, *args, **kwargs):
        """Train the model, building a default pipeline when none is supplied.

        Parameters
        ----------
        pipeline_model: optional
            Estimator to train. When None, a pipeline of ``LinkFeatures``,
            ``MaxAbsScaler`` and ``SVC(random_state=0, probability=True, C=10)``
            is created.
        train_data, test_data, train_targets, test_targets: optional
            Forwarded unchanged to ``Model.train``.
        save_on_finish: bool
            Forwarded unchanged to ``Model.train``.
        *args, **kwargs
            Forwarded unchanged to ``Model.train``.

        Returns
        -------
        Whatever ``Model.train`` returns.
        """
        if pipeline_model is None:
            pipeline_model = make_pipeline(
                LinkFeatures(),
                MaxAbsScaler(),
                SVC(random_state=0, probability=True, C=10),
            )

        return super().train(pipeline_model, train_data, test_data, train_targets, test_targets, save_on_finish,
                             *args,
                             **kwargs)
[docs]class LinkFeatures(TransformerMixin, BaseEstimator):
"""Transformer which handles LinkPredictor features"""
feats: list = [
DiscourseMatcher('forward'),
DiscourseMatcher('thesis'),
DiscourseMatcher('rebuttal'),
DiscourseMatcher('backward'),
]
def __init__(self):
    """Create the unfitted vectorisers, cover-sentence unions and label binarizers."""
    # One DictVectorizer for each of the two feature families.
    self.__nom_dict_features = DictVectorizer()
    self.__numeric_dict_features = DictVectorizer()

    # NOTE(review): both unions are handed the very same matcher objects from
    # the class-level ``feats`` list - confirm the matchers keep no fitted
    # state, otherwise fitting one union would affect the other.
    matchers = LinkFeatures.feats
    self.__arg1_cover_features = make_union(*matchers)
    self.__arg2_cover_features = make_union(*matchers)

    # One label binarizer per argument of the pair.
    self.__ohe_arg1 = LabelBinarizer()
    self.__ohe_arg2 = LabelBinarizer()
def fit(self, x, y=None):
    """Fits self to data provided.

    Parameters
    ----------
    x: list
        A list of datapoints which are to be transformed using the mixin.
        Every datapoint must provide ``arg1_covering_sentence`` and
        ``arg2_covering_sentence`` as well as the keys read by
        ``_prepare_numeric_feats`` and ``_prepare_dictionary_features``.
    y: optional
        Not used.

    Returns
    -------
    LinkFeatures
        ``self``
    """
    logger.debug("fitting...")

    frame = pandas.DataFrame(x)

    # Cover-sentence unions: the first argument is fitted before the second.
    arg1_sentences = frame.arg1_covering_sentence.tolist()
    self.__arg1_cover_features.fit(arg1_sentences)
    arg2_sentences = frame.arg2_covering_sentence.tolist()
    self.__arg2_cover_features.fit(arg2_sentences)

    # Dict vectorisers learn their vocabulary from the prepared feature dicts.
    numeric_rows = self._prepare_numeric_feats(x)
    self.__numeric_dict_features.fit(numeric_rows)
    nominal_rows = self._prepare_dictionary_features(x)
    self.__nom_dict_features.fit(nominal_rows)

    # The component types are a fixed vocabulary, not learnt from ``x``.
    component_types = ("Premise", "Claim", "MajorClaim")
    for binarizer in (self.__ohe_arg1, self.__ohe_arg2):
        binarizer.fit(list(component_types))

    return self
@staticmethod
def _prepare_dictionary_features(data):
    """Build the nominal feature dict of every datapoint.

    Parameters
    ----------
    data: list
        Datapoint dicts. ``data`` itself is left untouched; the result is
        written into a shallow copy of it.

    Returns
    -------
    list
        The copy of ``data`` with each datapoint replaced by its feature
        dict, in the same order.
    """
    # Keys copied over as-is. They are kept in two groups so that
    # ``arg1_is_premise`` keeps its place between them in the result.
    leading_keys = (
        "source_before_target",
        "arg1_first_in_paragraph",
        "arg1_last_in_paragraph",
        "arg2_first_in_paragraph",
        "arg2_last_in_paragraph",
    )
    trailing_keys = (
        "arg1_in_intro",
        "arg1_in_conclusion",
        "arg2_in_intro",
        "arg2_in_conclusion",
        "arg1_and_arg2_in_same_sentence",
        "arg1_indicator_type_follows_component",
        "arg2_indicator_type_follows_component",
        "arg1_indicator_type_precedes_component",
        "arg2_indicator_type_precedes_component",
    )

    noun_overlap = SharedNouns()
    rows = data.copy()
    for position, row in enumerate(rows):
        overlap = noun_overlap.transform(row.get("arg1_component"), row.get("arg2_component"))

        nominal = {key: row.get(key) for key in leading_keys}
        nominal["arg1_is_premise"] = row.get("arg1_type") == "Premise"
        nominal.update((key, row.get(key)) for key in trailing_keys)
        # Only whether the components share a noun is recorded here.
        nominal["share_nouns"] = overlap > 0

        rows[position] = nominal
    return rows
@staticmethod
def _prepare_numeric_feats(data):
    """Build the numeric feature dict of every datapoint.

    Parameters
    ----------
    data: list
        Datapoint dicts. Each must provide ``arg1_component``,
        ``arg2_component``, ``arg1_covering_sentence``,
        ``arg2_covering_sentence``, ``arg1_position`` and ``arg2_position``.
        ``n_para_components`` is read with ``get``. ``data`` itself is left
        untouched; the result is written into a shallow copy of it.

    Returns
    -------
    list
        The copy of ``data`` with each datapoint replaced by its feature
        dict, in the same order. An empty ``data`` gives an empty result.
    """
    # The covering sentences are read straight from the datapoints, so an
    # empty ``data`` needs no special case (a DataFrame column lookup would
    # fail when there are no rows).
    arg1_docs = list(_nlp.pipe([row["arg1_covering_sentence"] for row in data]))
    arg2_docs = list(_nlp.pipe([row["arg2_covering_sentence"] for row in data]))

    noun_overlap = SharedNouns()
    pos_dist = PosDistribution()

    rows = data.copy()
    for position, (row, arg1_doc, arg2_doc) in enumerate(zip(data, arg1_docs, arg2_docs)):
        n_shared_nouns = noun_overlap.transform(row.get("arg1_component"), row.get("arg2_component"))

        features = {
            "n_para_components": row.get('n_para_components'),
            "n_components_between_pair": abs(row.get("arg2_position") - row.get("arg1_position")),
            "arg1_component_token_len": len(nltk.word_tokenize(row['arg1_component'])),
            "arg2_component_token_len": len(nltk.word_tokenize(row['arg2_component'])),
            "arg1_cover_sen_token_len": len(nltk.word_tokenize(row['arg1_covering_sentence'])),
            "arg2_cover_sen_token_len": len(nltk.word_tokenize(row['arg2_covering_sentence'])),
            "shared_nouns": n_shared_nouns,
            "similarity": arg1_doc.similarity(arg2_doc),
        }

        # Entries returned by ``pos_dist`` are prefixed with the argument
        # they belong to so the two sentences do not overwrite each other.
        for prefix, sentence_key in (("arg1_", 'arg1_covering_sentence'), ("arg2_", 'arg2_covering_sentence')):
            for key, val in pos_dist(row[sentence_key]).items():
                features[prefix + str(key)] = val

        rows[position] = features
    return rows