#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
import logging
import codecs
from collections import defaultdict
from pprint import pprint  # pretty-printer

import numpy
from gensim import corpora, models
from gensim.similarities import Similarity

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


def calculateSimilarity(documents):
    """Tokenize the given documents, drop German stopwords, and keep only
    tokens that appear more than once across the whole collection."""
    stopword_file = "/home/100biere/demo/stopwordlist.de.txt"
    with codecs.open(stopword_file, 'r', encoding='utf-8') as f:
        stopwords = set(f.read().split("\n"))

    texts = [[word for word in document.lower().split()
              if word not in stopwords]
             for document in documents]

    # Count token frequencies across all documents.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # Keep only tokens that occur more than once.
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]
    pprint(texts)


def get_similarity_scores_all_pairs(texts):
    """Takes a list of strings as input and returns a matrix of cosine
    similarity values where element [m][n] represents the similarity
    between text m and text n.

    >>> get_similarity_scores_all_pairs(['apple banana', 'banana cherry'])
    array([[ 1.,  0.],
           [ 0.,  1.]])
    """
    n = len(texts)
    all_similarities = numpy.empty(shape=(n, n))
    similarity_index = __get_tfidf_similarity_index(texts)
    for i in range(n):
        all_similarities[i] = similarity_index.similarity_by_id(i)
    return all_similarities


def __get_tfidf_similarity_index(texts):
    """Takes a list of strings as input. Returns a gensim Similarity
    object for calculating cosine similarities."""
    texts_tokenized = [__tokenize_text(text) for text in texts]

    logging.debug('Creating corpora dictionary...')
    corpora_dict = corpora.Dictionary(texts_tokenized)
    logging.debug('Done creating corpora dictionary.')

    # gensim has us convert tokens to numeric IDs using corpora.Dictionary.
    corpus = [corpora_dict.doc2bow(text_tokenized)
              for text_tokenized in texts_tokenized]
    # Feed the corpus back into its own model to get the TF-IDF values
    # for the texts.
    corpus_tfidf = models.TfidfModel(corpus, normalize=True)[corpus]

    logging.debug('Creating Similarity index...')
    # output_prefix=None lets gensim shard the index to a temporary file.
    index = Similarity(None, corpus_tfidf, num_features=len(corpora_dict))
    logging.debug('Done creating Similarity index.')
    return index


def __tokenize_text(text):
    """Convert text to lowercase, strip periods and commas, and split it
    into a list of tokens.

    >>> __tokenize_text('hi. I am, a, sentence.')
    ['hi', 'i', 'am', 'a', 'sentence']
    """
    return text.lower().replace(',', '').replace('.', '').split()
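

# --- Minimal usage sketch (an illustrative addition, not part of the
# original script): exercises get_similarity_scores_all_pairs() on a few
# toy strings. calculateSimilarity() is not called here because it depends
# on the hardcoded stopword-list path above, which may not exist locally.
if __name__ == '__main__':
    sample_texts = ['apple banana', 'banana cherry', 'apple cherry banana']
    # Prints an n x n matrix; entry [m][n] is the TF-IDF cosine similarity
    # between sample_texts[m] and sample_texts[n]. The diagonal is 1.0,
    # and 'banana' contributes nothing since it appears in every document
    # (its IDF weight is zero).
    print(get_similarity_scores_all_pairs(sample_texts))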