# Source code for pyate.term_extractor
# c_value
import math
from typing import Mapping, Sequence
import pandas as pd
import numpy as np
from .term_extraction import TermExtraction, add_term_extraction_method, Corpus
@add_term_extraction_method
def term_extractor(
    technical_corpus: Corpus,
    general_corpus: Corpus = None,
    general_corpus_size=TermExtraction.config["DEFAULT_GENERAL_DOMAIN_SIZE"],
    weights: Sequence[float] = None,
    verbose: bool = False,
    technical_counts: Mapping[str, int] = None,
):
    """Score candidate terms of *technical_corpus* by combining three metrics.

    Each candidate term receives the weighted average of three scores, each
    normalized to [0, 1] over the vocabulary:

    * domain pertinence  - relative frequency in the technical corpus vs.
      the general corpus,
    * domain consensus   - entropy of the term's distribution across the
      technical documents,
    * lexical cohesion   - how strongly the term's words co-occur as the
      term rather than independently.

    Parameters
    ----------
    technical_corpus : Corpus
        Domain-specific documents to extract terms from.
    general_corpus : Corpus, optional
        Background corpus; defaults to the bundled general domain.
    general_corpus_size : int
        Kept for interface compatibility; not used in this implementation.
    weights : Sequence[float], optional
        Weights for (pertinence, consensus, cohesion); default is equal
        weighting (1/3 each).
    verbose : bool
        If True, print the three intermediate score series.
    technical_counts : Mapping[str, int], optional
        Precomputed per-term counts for the technical corpus.  When given,
        recounting is skipped (previously this argument was silently
        ignored and the counts were always recomputed).

    Returns
    -------
    pd.Series
        Term -> combined score.
    """
    if general_corpus is None:
        general_corpus = TermExtraction.get_general_domain()

    def xlogx(x):
        # x * log(x) with the 0 * log(0) == 0 convention used by entropy.
        return x * math.log(x) if x else 0

    # Per-document term counts: a DataFrame when the corpus has several
    # documents, otherwise a Series.
    if technical_counts is None:
        technical_counts_separate = TermExtraction(
            technical_corpus
        ).count_terms_from_documents(True, verbose=verbose)
    elif isinstance(technical_counts, (pd.Series, pd.DataFrame)):
        technical_counts_separate = technical_counts
    else:
        # Accept any Mapping[str, int], per the annotation.
        technical_counts_separate = pd.Series(technical_counts)
    if isinstance(technical_counts_separate, pd.DataFrame):
        technical_counts = technical_counts_separate.sum(axis=1)
    else:
        technical_counts = technical_counts_separate

    # --- domain pertinence -------------------------------------------------
    general_counts = TermExtraction(
        general_corpus, vocab=technical_counts.index
    ).count_terms_from_documents(verbose=verbose)
    general_counts /= general_counts.max()
    domain_pertinence = pd.DataFrame(
        data={
            "technical": technical_counts / technical_counts.sum(),
            "general": general_counts,
        }
    ).fillna(0)

    def domain_pertinence_function(row):
        tech, gen = row.iloc
        return tech / max(tech, gen)

    domain_pertinence = domain_pertinence.apply(domain_pertinence_function, axis=1)

    # --- domain consensus --------------------------------------------------
    domain_consensus = technical_counts_separate
    if isinstance(domain_consensus, pd.DataFrame):
        # Entropy of each term's frequency distribution across documents.
        domain_consensus = domain_consensus.div(domain_consensus.sum(axis=0), axis=1)
        domain_consensus = domain_consensus.applymap(lambda x: -xlogx(x))
        domain_consensus = domain_consensus.sum(axis=1)
    else:
        domain_consensus = domain_consensus.apply(lambda x: -xlogx(x))

    # --- lexical cohesion --------------------------------------------------
    term_words = set(
        word for term in technical_counts_separate.index for word in term.split()
    )
    term_counts = TermExtraction(
        technical_corpus, term_words
    ).count_terms_from_documents()

    def lexical_cohesion_function(row):
        term, freq = row.iloc
        # remove plus 1 later (note carried over from the original author)
        return (
            TermExtraction.word_length(term)
            * xlogx(freq)
            / sum(term_counts.loc[w] for w in term.split())
        )

    lexical_cohesion = pd.Series(
        technical_counts.reset_index().apply(lexical_cohesion_function, axis=1).values,
        index=technical_counts.index,
    )

    # Scale each metric to [0, 1] before combining.
    domain_pertinence /= domain_pertinence.max()
    domain_consensus /= domain_consensus.max()
    lexical_cohesion /= lexical_cohesion.max()
    df = pd.DataFrame(
        data={
            "domain_pertinence": domain_pertinence,
            "domain_consensus": domain_consensus,
            "lexical_cohesion": lexical_cohesion,
        }
    ).fillna(0)
    if verbose:
        print(
            domain_pertinence.sort_values(ascending=False),
            "\n",
            domain_consensus.sort_values(ascending=False),
            "\n",
            lexical_cohesion.sort_values(ascending=False),
        )
    if weights is None:
        # Equal weighting of the three metrics.
        weights = np.array([1, 1, 1]) / 3
    return df.dot(weights)
if __name__ == "__main__":
    # Smoke test: score terms from a sample of PMC documents against a
    # Wikipedia sample serving as the general domain.
    GENERAL_PICKLE = "../data/wiki_testing.pkl"
    TECHNICAL_PICKLE = "../data/pmc_testing.pkl"
    general_docs = pd.read_pickle(GENERAL_PICKLE)
    technical_docs = pd.read_pickle(TECHNICAL_PICKLE)
    scores = term_extractor(technical_docs[:50], general_docs[:1000], verbose=True)
    print(scores.sort_values(ascending=False).head(50))
    # Author notes kept from the original:
    # Syntactic Analysis: (PP NP PP CNP RVP NP PP)
    # POS: (PAJNNN AJN PNCNNWVANPJN)