Source code for pyate.weirdness

# weirdness.py

from typing import Mapping
import pandas as pd

from .term_extraction import TermExtraction, add_term_extraction_method, Corpus


[docs]@add_term_extraction_method def weirdness( technical_corpus: Corpus, general_corpus: Corpus = None, general_corpus_size=TermExtraction.config["DEFAULT_GENERAL_DOMAIN_SIZE"], normalized: bool = False, technical_counts: Mapping[str, int] = None, verbose: bool = False, ) -> pd.Series: # http://ceur-ws.org/Vol-1031/paper3.pdf if general_corpus is None: general_corpus = TermExtraction.get_general_domain() if technical_counts is None: # this is the bulk of the calculations technical_counts = TermExtraction(technical_corpus).count_terms_from_documents( verbose=verbose ) general_counts = TermExtraction( general_corpus, technical_counts.index ).count_terms_from_documents() technical_word_count = TermExtraction.word_length("\n".join(technical_corpus)) general_word_count = TermExtraction.word_length("\n".join(general_corpus)) zero_division_preventer = pd.Series(index=technical_counts.index, data=1) general_counts = zero_division_preventer.add(general_counts, fill_value=0) if normalized: return ( technical_counts * general_word_count / general_counts / technical_word_count ) else: return technical_counts / general_counts
if __name__ == "__main__": PATH_TO_GENERAL_DOMAIN = "../data/wiki_testing.pkl" PATH_TO_TECHNICAL_DOMAIN = "../data/pmc_testing.pkl" wiki = pd.read_pickle(PATH_TO_GENERAL_DOMAIN) pmc = pd.read_pickle(PATH_TO_TECHNICAL_DOMAIN) pairdf = weirdness(pmc[:200], wiki[:500]) print(pairdf.sort_values(ascending=False).head(50))