# Source code for pyate.term_extraction_pipeline
from collections import defaultdict
from typing import Callable
import pandas as pd
from spacy.tokens import Doc
from .combo_basic import combo_basic
from .term_extraction import TermExtraction
from spacy.matcher import Matcher
class TermExtractionPipeline:
    """
    This is for adding PyATE as a spaCy pipeline component.

    On construction it builds a ``Matcher`` from ``TermExtraction.patterns``
    and registers a ``Doc`` extension named after the wrapped extraction
    function. When called on a ``Doc`` it counts the matched candidate terms,
    passes them to the wrapped function and stores the result in that
    extension.
    """

    def __init__(
        self,
        nlp,
        func: Callable[..., pd.Series] = combo_basic,
        force: bool = True,
        *args,
        **kwargs
    ) -> None:
        """
        This is for initializing the TermExtractionPipeline.

        :param nlp: object whose ``vocab`` attribute is handed to ``Matcher``
            (presumably the spaCy ``Language`` object -- confirm with callers).
        :param func: term-extraction function run on every document; its
            ``__name__`` becomes both this component's ``__name__`` and the
            name of the ``Doc`` extension that holds the results.
        :param force: forwarded to ``Doc.set_extension``; lets an extension
            already registered under the same name be overwritten.
        :param args: extra positional arguments forwarded to ``func`` on
            every call.
        :param kwargs: extra keyword arguments forwarded to ``func`` on
            every call.
        """
        self.func = func
        self.args = args
        self.kwargs = kwargs
        self.__name__ = self.func.__name__
        self.matcher = Matcher(nlp.vocab)
        Doc.set_extension(self.__name__, default=None, force=force)
        # Replaced by a fresh counter at the start of every __call__.
        self.term_counter = None

        def add_to_counter(matcher, doc, i, matches) -> None:
            # on_match callback: count the text of the matched span unless
            # its TermExtraction.word_length exceeds the configured
            # MAX_WORD_LENGTH.
            _, start, end = matches[i]
            candidate = str(doc[start:end])
            if (
                TermExtraction.word_length(candidate)
                <= TermExtraction.config["MAX_WORD_LENGTH"]
            ):
                self.term_counter[candidate] += 1

        # NOTE(review): this is the legacy spaCy v2 call form
        # ``add(key, on_match, *patterns)``; spaCy v3 expects
        # ``add(key, [pattern], on_match=...)``. Confirm the supported spaCy
        # range before changing it.
        for index, pattern in enumerate(TermExtraction.patterns):
            self.matcher.add("term{}".format(index), add_to_counter, pattern)

    def __call__(self, doc: Doc) -> Doc:
        """
        This function will be called from within spaCy's utilities.

        :param doc: document to extract terms from.
        :return: the same ``doc``, with the extension named after the wrapped
            function set to that function's return value.
        """
        # Start from an empty counter so counts never carry over between
        # documents.
        self.term_counter = defaultdict(int)
        # Running the matcher invokes add_to_counter for each match, which
        # fills self.term_counter.
        self.matcher(doc)
        terms = self.func(
            str(doc),
            *self.args,
            technical_counts=pd.Series(self.term_counter),
            **self.kwargs
        )
        setattr(doc._, self.__name__, terms)
        return doc