# Source code for pyate.term_extraction_pipeline

from collections import defaultdict
from typing import Callable

import pandas as pd
from spacy.tokens import Doc

from .combo_basic import combo_basic
from .term_extraction import TermExtraction
from spacy.matcher import Matcher


class TermExtractionPipeline:
    """
    Callable wrapper that exposes a PyATE term-extraction function as a
    spaCy pipeline component.

    On construction it registers a custom ``Doc`` extension named after
    ``func.__name__`` and builds a ``Matcher`` with one rule per entry in
    ``TermExtraction.patterns``. When called on a ``Doc`` it counts the
    matched candidate terms, passes those counts to ``func``, and stores
    the result on ``doc._.<func.__name__>``.
    """

    def __init__(
        self,
        nlp,
        func: Callable[..., pd.Series] = combo_basic,
        force: bool = True,
        *args,
        **kwargs
    ) -> None:
        """
        Set up the matcher and the ``Doc`` extension for this component.

        :param nlp: object whose ``vocab`` attribute is handed to
            ``Matcher`` (presumably the spaCy ``Language`` pipeline --
            confirm against callers).
        :param func: term-extraction function; it is called with the
            document text plus a ``technical_counts`` keyword argument.
        :param force: forwarded to ``Doc.set_extension`` as ``force``.
        :param args: extra positional arguments forwarded to ``func`` on
            every call, after the document text.
        :param kwargs: extra keyword arguments forwarded to ``func`` on
            every call.
        """
        self.func = func
        self.args = args
        self.kwargs = kwargs
        # The extension attribute on ``doc._`` is named after the function.
        self.__name__ = self.func.__name__
        self.matcher = Matcher(nlp.vocab)
        Doc.set_extension(self.__name__, default=None, force=force)
        # Replaced by a fresh counter at the start of every __call__;
        # stays None until the component has been run at least once.
        self.term_counter = None

        def add_to_counter(matcher, doc, i, matches) -> None:
            """Matcher callback: count the matched span if it is short enough."""
            _, start, end = matches[i]
            candidate = str(doc[start:end])
            if (
                TermExtraction.word_length(candidate)
                <= TermExtraction.config["MAX_WORD_LENGTH"]
            ):
                self.term_counter[candidate] += 1

        # NOTE(review): this passes the callback positionally followed by a
        # single pattern; newer spaCy releases document
        # ``add(key, [pattern], on_match=callback)`` instead -- confirm
        # against the spaCy version this package targets before changing.
        for i, pattern in enumerate(TermExtraction.patterns):
            self.matcher.add("term{}".format(i), add_to_counter, pattern)

    def __call__(self, doc: Doc) -> Doc:
        """
        Extract terms from ``doc`` and attach them to the document.

        Resets the per-document counter, runs the matcher (whose callback
        fills the counter), calls ``func`` with the document text and the
        counts as a ``pd.Series``, then stores the result on
        ``doc._.<func.__name__>``.

        :param doc: the document to process.
        :return: the same ``doc`` object, with the extension attribute set.
        """
        # Fresh counter per document so counts never leak between calls.
        self.term_counter = defaultdict(int)
        self.matcher(doc)
        terms = self.func(
            str(doc),
            *self.args,
            technical_counts=pd.Series(self.term_counter),
            **self.kwargs
        )
        setattr(doc._, self.__name__, terms)
        return doc