# Source code for pyate.combo_basic

# combo basic
import math
from typing import List, Mapping, Sequence

import pandas as pd
import numpy as np
from tqdm import tqdm

from .term_extraction import TermExtraction, add_term_extraction_method, Corpus


def helper_get_subsequences(s: str) -> List[str]:
    """Return every contiguous proper sub-phrase of a whitespace-separated term.

    The term is split on whitespace and each contiguous run of words other
    than the full term itself is joined back with single spaces. Results are
    ordered by start position, then by increasing length. Repeated words can
    therefore produce repeated entries.

    Args:
        s: The term to break into sub-phrases.

    Returns:
        The list of sub-phrases. Terms of fewer than three words yield an
        empty list.
    """
    words = s.split()
    word_count = len(words)

    # One- and two-word terms are reported as having no sub-phrases at all.
    if word_count < 3:
        return []

    return [
        " ".join(words[start:stop])
        for start in range(word_count)
        for stop in range(start + 1, word_count + 1)
        # Skip the single window that spans the whole term.
        if (start, stop) != (0, word_count)
    ]
@add_term_extraction_method
def combo_basic(
    technical_corpus: Corpus,
    smoothing: float = 0.01,
    verbose: bool = False,
    have_single_word: bool = False,
    technical_counts: Mapping[str, int] = None,
    weights: Sequence[float] = None,
) -> pd.Series:
    """Score candidate terms with a weighted sum of three features.

    For every candidate term the following features are computed:

    * ``xlogx_score``: ``log(word_length(term)) * count(term)``, where the
      word length is whatever ``TermExtraction.word_length`` reports.
    * ``times_subset``: how many times the term occurs as a sub-phrase
      (per ``helper_get_subsequences``) of another candidate.
    * ``times_superset``: how many of the term's own sub-phrases are
      themselves candidates.

    The final score is the dot product of these three features, in that
    order, with ``weights``.

    Args:
        technical_corpus: Corpus the candidate counts are taken from. Only
            used when ``technical_counts`` is not supplied.
        smoothing: Not used by this method; kept so the signature stays
            compatible with existing callers.
        verbose: Show a progress bar while scanning candidates, and forward
            the flag to the term counter.
        have_single_word: Keep candidates whose word length is 1. They are
            dropped by default.
        technical_counts: Precomputed term counts, as a ``pd.Series`` indexed
            by term or any mapping from term to count.
        weights: Three coefficients for ``xlogx_score``, ``times_subset`` and
            ``times_superset``. Defaults to ``[1, 0.75, 0.1]``.

    Returns:
        A ``pd.Series`` of scores indexed by term, ordered from the longest
        term to the shortest. It is empty when there are no candidates.
    """
    if technical_counts is None:
        technical_counts = (
            TermExtraction(technical_corpus)
            .count_terms_from_documents(verbose=verbose)
            .reindex()
        )

    if len(technical_counts) == 0:
        return pd.Series(dtype=TermExtraction.config["dtype"])

    # The annotation allows any mapping, but the code below needs Series
    # operations (list-based selection, ``.index``), so normalise plain
    # mappings up front instead of failing on them.
    if not isinstance(technical_counts, pd.Series):
        technical_counts = pd.Series(dict(technical_counts))

    # Longest candidates first; the sort is stable, so ties keep input order.
    order = sorted(
        technical_counts.keys(), key=TermExtraction.word_length, reverse=True
    )
    if not have_single_word:
        order = [term for term in order if TermExtraction.word_length(term) > 1]
    if not order:
        return pd.Series(dtype=TermExtraction.config["dtype"])

    technical_counts = technical_counts[order]
    terms = list(technical_counts.index)

    # Pair each term with its count directly rather than going through
    # ``reset_index()``: that depends on the default column labels and breaks
    # as soon as the Series or its index carries a name.
    xlogx_scores = [
        math.log(TermExtraction.word_length(term)) * count
        for term, count in zip(terms, technical_counts.values)
    ]

    # Plain dict counters avoid a DataFrame cell read and write per match.
    known_terms = set(terms)
    times_subset = dict.fromkeys(terms, 0)
    times_superset = dict.fromkeys(terms, 0)
    iterator = tqdm(terms) if verbose else terms
    for term in iterator:
        for sub_phrase in helper_get_subsequences(term):
            if sub_phrase in known_terms:
                times_subset[sub_phrase] += 1
                times_superset[term] += 1

    df = pd.DataFrame(
        {
            "xlogx_score": xlogx_scores,
            "times_subset": [times_subset[term] for term in terms],
            "times_superset": [times_superset[term] for term in terms],
        },
        index=technical_counts.index,
    )

    if weights is None:
        weights = np.array([1, 0.75, 0.1])
    weights = np.asarray(weights, dtype=float)

    # One matrix-vector product in place of a Python-level row-wise apply.
    return pd.Series(df.values.dot(weights), index=df.index)
if __name__ == "__main__":
    # Manual smoke run: score a tiny sample sentence and print the
    # highest-scoring terms (at most 50), best first.
    sample_text = "Hello I am a term extractor."
    scores = TermExtraction(sample_text).combo_basic()
    ranked = scores.sort_values(ascending=False)
    print(ranked.head(50))