# Source code for pyate.cvalues

# c_value
import math
from typing import List
from typing import Mapping

import pandas as pd
from tqdm import tqdm

from .term_extraction import add_term_extraction_method
from .term_extraction import Corpus
from .term_extraction import TermExtraction


def helper_get_subsequences(s: str) -> List[str]:
    """Return every proper contiguous word subsequence of ``s``.

    ``s`` is split on whitespace; each result is the chosen words re-joined
    with single spaces. The full sequence itself is never included.

    Args:
        s: A whitespace-separated string of words.

    Returns:
        The subsequences ordered by start position and then by length.
        A word sequence that occurs at several positions is listed once per
        position. Strings of two words or fewer yield an empty list.
    """
    words = s.split()
    total = len(words)
    # NOTE(review): two-word strings yield nothing, so the single-word parts
    # of a bigram are never reported -- confirm this is intended for callers
    # that work with single-word terms.
    if total <= 2:
        return []
    return [
        " ".join(words[left:right])
        for left in range(total)
        for right in range(left + 1, total + 1)
        # Skip the one slice that spans the whole input.
        if not (left == 0 and right == total)
    ]


@add_term_extraction_method
def cvalues(
    technical_corpus: Corpus,
    smoothing: float = 0.01,
    verbose: bool = False,
    have_single_word: bool = False,
    technical_counts: Mapping[str, int] = None,
    threshold: float = 0,
) -> pd.Series:
    """Score candidate terms with the C-value measure.

    Candidates are processed from the longest to the shortest. Each one
    receives::

        log2(word_length + smoothing) * (f - t / n)

    where ``f`` is the candidate's own frequency, ``n`` is the number of
    already accepted longer candidates that contain it, and ``t`` is the
    summed frequency of those longer candidates. When no accepted longer
    candidate contains it, or when its length equals
    ``TermExtraction.config["MAX_WORD_LENGTH"]``, the score is simply
    ``log2(word_length + smoothing) * f``.

    Args:
        technical_corpus: Corpus handed to ``TermExtraction`` to count
            candidate terms; only used when ``technical_counts`` is None.
        smoothing: Added to the word length inside the logarithm, so that a
            one-word term does not get a weight of ``log2(1) == 0``.
        verbose: Forwarded to ``count_terms_from_documents`` and, when true,
            wraps the scoring loop in a ``tqdm`` progress bar.
        have_single_word: When false, candidates whose word length is not
            greater than 1 are discarded before scoring.
        technical_counts: Optional precomputed term -> frequency counts,
            either a ``pandas.Series`` indexed by term or any mapping.
        threshold: Minimum score for a candidate to be returned and to count
            as an enclosing term for its subsequences.

    Returns:
        A ``pandas.Series`` of scores indexed by term, sorted in descending
        order.
    """
    if technical_counts is None:
        technical_counts = (
            TermExtraction(technical_corpus)
            .count_terms_from_documents(verbose=verbose)
            .reindex()
        )
    elif not isinstance(technical_counts, pd.Series):
        # The signature advertises a Mapping, but the label-list selection
        # below needs a Series.
        technical_counts = pd.Series(dict(technical_counts))

    # Longest candidates first, so every enclosing term is scored before the
    # terms nested inside it.
    order = sorted(
        technical_counts.keys(), key=TermExtraction.word_length, reverse=True
    )
    if not have_single_word:
        order = [term for term in order if TermExtraction.word_length(term) > 1]

    ordered_counts = technical_counts[order]
    candidates = set(order)
    max_length = TermExtraction.config["MAX_WORD_LENGTH"]

    # Nesting statistics live in plain dicts that are read on every iteration.
    # Mutating a DataFrame while looping over DataFrame.iterrows() does not
    # work here: iterrows() walks a snapshot, so later rows would never see
    # the updates made by earlier ones.
    nested_frequency = {}  # term -> summed frequency of accepted enclosing terms
    enclosing_count = {}  # term -> number of accepted enclosing terms

    terms = []
    scores = []

    items = ordered_counts.items()
    iterator = tqdm(items, total=len(ordered_counts)) if verbose else items

    for candidate, frequency in iterator:
        length = TermExtraction.word_length(candidate)
        weight = math.log2(length + smoothing)

        enclosing = enclosing_count.get(candidate, 0)
        if enclosing and length != max_length:
            # Discount the occurrences explained by longer accepted terms.
            c_val = weight * (frequency - nested_frequency[candidate] / enclosing)
        else:
            c_val = weight * frequency

        if c_val >= threshold:
            terms.append(candidate)
            scores.append(c_val)
            # set(): a subsequence repeated inside one candidate must count
            # that candidate as a single enclosing term.
            for substring in set(helper_get_subsequences(candidate)):
                if substring in candidates:
                    enclosing_count[substring] = enclosing_count.get(substring, 0) + 1
                    nested_frequency[substring] = (
                        nested_frequency.get(substring, 0) + frequency
                    )

    # An explicit dtype keeps the result numeric even when nothing qualifies.
    result = pd.Series(scores, index=terms, dtype="float64")
    return result.sort_values(ascending=False)


if __name__ == "__main__":
    # Manual smoke run: score a one-sentence sample and print the result.
    sample_text = "Hello, I am a term extractor."
    extractor = TermExtraction(sample_text)
    print(extractor.cvalues(verbose=True))