Source code for pyate.term_extractor

# c_value

import math
from typing import Mapping, Sequence
import pandas as pd
import numpy as np

from .term_extraction import TermExtraction, add_term_extraction_method, Corpus


[docs]@add_term_extraction_method def term_extractor( technical_corpus: Corpus, general_corpus: Corpus = None, general_corpus_size=TermExtraction.config["DEFAULT_GENERAL_DOMAIN_SIZE"], weights: Sequence[float] = None, verbose: bool = False, technical_counts: Mapping[str, int] = None, ): if general_corpus is None: general_corpus = TermExtraction.get_general_domain() # reused initializations technical_counts_seperate = TermExtraction( technical_corpus ).count_terms_from_documents(True, verbose=verbose) if type(technical_counts_seperate) is pd.DataFrame: technical_counts = technical_counts_seperate.sum(axis=1) else: technical_counts = technical_counts_seperate XLOGX = lambda x: x * math.log(x) if x else 0 # domain pertinence general_counts = TermExtraction( general_corpus, vocab=technical_counts.index ).count_terms_from_documents(verbose=verbose) general_counts /= general_counts.max() domain_pertinence = pd.DataFrame( data={ "technical": technical_counts / technical_counts.sum(), "general": general_counts, } ).fillna(0) def domain_pertinence_function(s): tech, gen = s.iloc return tech / max(tech, gen) domain_pertinence = domain_pertinence.apply(domain_pertinence_function, axis=1) # domain consensus domain_consensus = technical_counts_seperate if type(technical_counts_seperate) is pd.DataFrame: domain_consensus = domain_consensus.div(domain_consensus.sum(axis=0), axis=1) domain_consensus = domain_consensus.applymap(lambda x: -XLOGX(x)) domain_consensus = domain_consensus.sum(axis=1) else: domain_consensus = domain_consensus.apply(lambda x: -XLOGX(x)) # Lexical cohesion term_words = set( word for term in technical_counts_seperate.index for word in term.split() ) term_counts = TermExtraction( technical_corpus, term_words ).count_terms_from_documents() lexical_cohesion = technical_counts def lexical_cohesion_function(row): word, freq = row.iloc return ( TermExtraction.word_length(word) * XLOGX(freq) # remove plus 1 later / sum(map(lambda s: term_counts.loc[s], word.split())) ) lexical_cohesion = pd.Series( lexical_cohesion.reset_index().apply(lexical_cohesion_function, axis=1).values, index=lexical_cohesion.index, ) # print( # domain_pertinence.sort_values(ascending=False), # "\n", # domain_consensus.sort_values(ascending=False), # "\n", # lexical_cohesion.sort_values(ascending=False), # ) # print('lexical cohesion:', lexical_cohesion) domain_pertinence /= domain_pertinence.max() domain_consensus /= domain_consensus.max() lexical_cohesion /= lexical_cohesion.max() df = pd.DataFrame( data={ "domain_pertinence": domain_pertinence, "domain_consensus": domain_consensus, "lexical_cohesion": lexical_cohesion, } ).fillna(0) if verbose: print( domain_pertinence.sort_values(ascending=False), "\n", domain_consensus.sort_values(ascending=False), "\n", lexical_cohesion.sort_values(ascending=False), ) if weights is None: weights = np.array([1, 1, 1]) / 3 return df.dot(weights)
if __name__ == "__main__": PATH_TO_GENERAL_DOMAIN = "../data/wiki_testing.pkl" PATH_TO_TECHNICAL_DOMAIN = "../data/pmc_testing.pkl" wiki = pd.read_pickle(PATH_TO_GENERAL_DOMAIN) pmc = pd.read_pickle(PATH_TO_TECHNICAL_DOMAIN) # term_extractor(pmc[:50], wiki[:250]) # print(term_extractor(pmc, wiki).sort_values(ascending=False).head(50)) print( term_extractor(pmc[:50], wiki[:1000], verbose=True) .sort_values(ascending=False) .head(50) ) # print(pmc[0]) # Syntactic Analysis: (PP NP PP CNP RVP NP PP) # POS: (PAJNNN AJN PNCNNWVANPJN)