#!/usr/bin/python3.5
# -*- coding: utf-8 -*-
import codecs
import logging
from collections import defaultdict
from pprint import pprint  # pretty-printer

import numpy
from gensim import corpora, models, similarities
from gensim.similarities import Similarity

#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


def calculateSimilarity(documents):
    """Build bag-of-words and LSI indexes over `documents` (a list of strings)
    and print how similar each indexed document is to the query "Instagram"."""
    stopword_path = "/home/100biere/demo/stopwordlist.de.txt"
    with codecs.open(stopword_path, 'r', encoding='utf-8') as f:
        stopwords = set(f.read().strip().split())
    # the `with` block closes the file; no explicit f.close() is needed

    # Tokenize each document individually and drop stopwords.
    texts = [[word for word in document.lower().split() if word not in stopwords]
             for document in documents]

    # Count token frequencies across the whole collection ...
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # ... and keep only tokens that appear more than once.
    texts_final = [[token for token in text if frequency[token] > 1]
                   for text in texts]

    dictionary = corpora.Dictionary(texts_final)
    dictionary.save('/home/100biere/demo/demo.dict')
    corpus = [dictionary.doc2bow(text) for text in texts_final]
    corpora.MmCorpus.serialize('/home/100biere/demo/demo.dict.mm', corpus)  # store to disk, for later use

    # Build the on-disk index; num_features must match the dictionary size.
    index = Similarity('/home/100biere/demo/demo.sim', corpus,
                       num_features=len(dictionary))
    # The query must be converted to bag-of-words first; a raw string is not
    # a valid query vector.
    query_bow = dictionary.doc2bow("Instagram".lower().split())
    pprint(index[query_bow])  # similarities between the query and all indexed documents

    # The same query again, this time in LSI space.
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
    vec_lsi = lsi[query_bow]  # convert the query to LSI space
    lsi_index = similarities.MatrixSimilarity(lsi[corpus])
    #lsi_index.save('/tmp/deerwester.index')
    #lsi_index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')
    sims = lsi_index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    print(sims)  # print sorted (document number, similarity score) 2-tuples


def get_similarity_scores_all_pairs(texts):
    """Takes a list of strings as input and returns a matrix of cosine
    similarity values where element [m][n] represents the similarity between
    text m and text n.

    >>> get_similarity_scores_all_pairs(['apple banana', 'banana cherry'])
    array([[ 1.,  0.],
           [ 0.,  1.]])
    """
    n = len(texts)
    all_similarities = numpy.empty(shape=(n, n))
    similarity_index = __get_tfidf_similarity_index(texts)
    for i in range(n):
        all_similarities[i] = similarity_index.similarity_by_id(i)
    return all_similarities


def __get_tfidf_similarity_index(texts):
    """Takes a list of strings as input. Returns a gensim Similarity object
    for calculating cosine similarities."""
    texts_tokenized = [__tokenize_text(text) for text in texts]
    logging.debug('Creating corpora dictionary...')
    corpora_dict = corpora.Dictionary(texts_tokenized)
    logging.debug('Done creating corpora dictionary.')
    # gensim has us convert tokens to numeric IDs using corpora.Dictionary
    corpus = [corpora_dict.doc2bow(text_tokenized)
              for text_tokenized in texts_tokenized]
    # Feed the corpus back into its own model to get the TF-IDF values for the texts.
    corpus_tfidf = models.TfidfModel(corpus, normalize=True)[corpus]
    logging.debug('Creating Similarity index...')
    index = Similarity(None, corpus_tfidf, num_features=len(corpora_dict))
    logging.debug('Done creating Similarity index.')
    return index


def __tokenize_text(text):
    """Convert text to lowercase, strip periods and commas, and split it into
    a list of words.

    >>> __tokenize_text('hi. I am, a, sentence.')
    ['hi', 'i', 'am', 'a', 'sentence']
    """
    return text.lower().replace(',', '').replace('.', '').split()
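
# A minimal usage sketch (an addition, not part of the original script): it
# runs a small made-up corpus through get_similarity_scores_all_pairs, under
# the assumption that the module is executed directly.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    demo_texts = ['apple banana', 'banana cherry', 'cherry date apple']
    scores = get_similarity_scores_all_pairs(demo_texts)
    # Row m holds the cosine similarity of text m to every text in the list;
    # the diagonal is ~1.0 because each text is identical to itself.
    pprint(scores)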