#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
import os
import sys
import time
import logging
import codecs
import string
from collections import defaultdict
from pprint import pprint  # pretty-printer

import nltk
import gensim
from gensim import corpora, models, similarities
# https://radimrehurek.com/gensim/apiref.html
from gensim.models import word2vec, doc2vec
# pip install pyemd  (required for wmdistance)
from sklearn.feature_extraction.text import TfidfVectorizer
# pip install --upgrade sklearn

word2vecpath = "/home/100biere/demo/realdemo/Word2Vec.mod"
doc2vecpath = "/home/100biere/demo/realdemo/Doc2Vec.mod"
doc2vecpathDict = "/home/100biere/demo/realdemo/Doc2Vec.dict"
doc2vecpathCorp = "/home/100biere/demo/realdemo/Doc2Vec.corp"
stopword = "/home/100biere/demo/realdemo/stopwordlist.de.txt"


def CalcCosineSimilarity(documents, markov_sentence):
    with codecs.open(stopword, 'r', encoding='utf-8') as f:
        stopwords_tmp = f.read()
    stopwords = set(stopwords_tmp.strip().split())

    stemmer = nltk.stem.porter.PorterStemmer()
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

    def stem_tokens(tokens):
        return [stemmer.stem(item) for item in tokens]

    def normalize(text):
        """Remove punctuation, lowercase, stem."""
        return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

    vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words=stopwords)

    def cosine_sim(text1, text2):
        tfidf = vectorizer.fit_transform([text1, text2])
        return ((tfidf * tfidf.T).A)[0, 1]

    print("CalcCosineSimilarity:", cosine_sim(documents, markov_sentence))
    return True


def CalcDocSimilarity(documents, markov_sentence):
    if os.path.isfile(doc2vecpath):
        # Reuse the previously saved dictionary, corpus and LSI model.
        dictionary = corpora.Dictionary.load(doc2vecpathDict)
        corpus = gensim.corpora.mmcorpus.MmCorpus(doc2vecpathCorp)
        lsi = models.LsiModel.load(doc2vecpath)
        # index = similarities.MatrixSimilarity(lsi[corpus], num_features=len(corpus))
        vec_bow1 = dictionary.doc2bow(markov_sentence.lower().split(), allow_update=False, return_missing=False)
        vec_lsi1 = lsi[vec_bow1]
        # sims1 = index[vec_lsi1]  # perform a similarity query against the corpus
        # print("sim1 value:")
        # pprint(index)
        # print(sims1)
        try:
            if vec_lsi1[0][1]:
                return vec_lsi1[0][1]
            else:
                return 0.0
        except Exception:
            return 0.0
    else:
        # documents is passed in as a plain text string
        docs = [documents]
        with codecs.open(stopword, 'r', encoding='utf-8') as f:
            stopwords_tmp = f.read()
        stopwords = set(stopwords_tmp.strip().split())

        texts = [[word for word in document.lower().split() if word not in stopwords]
                 for document in docs]
        # keep only tokens that occur more than once
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > 1] for text in texts]

        dictionary = corpora.Dictionary(texts)
        dictionary.save(doc2vecpathDict)
        corpus = [dictionary.doc2bow(text) for text in texts]
        gensim.corpora.mmcorpus.MmCorpus.serialize(doc2vecpathCorp, corpus)
        # lda = gensim.models.LdaModel(corpus, id2word=dictionary, alpha='auto', num_topics=30, chunksize=100000, passes=2)
        lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=30, chunksize=100000)
        # index = similarities.MatrixSimilarity(lsi[corpus], num_features=len(corpus))
        # transform the corpus to LSI space, save the model and score the sentence
        lsi.save(doc2vecpath)
        vec_bow1 = dictionary.doc2bow(markov_sentence.lower().split())
        vec_lsi1 = lsi[vec_bow1]
        # sims1 = index[vec_lsi1]  # perform a similarity query against the corpus
        try:
            if vec_lsi1[0][1]:
                return vec_lsi1[0][1]
            else:
                return 0.0
        except Exception:
            return 0.0


def CalcWordSimilarity(documents, markov_sentence, train=True):
    # https://radimrehurek.com/gensim/models/phrases.html#module-gensim.models.phrases
    # gensim Phrases can be used to model bi- and trigrams.
    #
    # n_similarity(ws1, ws2)
    # score(sentences, total_sentences=1000000, chunksize=100, queue_factor=2, report_delay=1)
    #   --> https://radimrehurek.com/gensim/models/word2vec.html
    if train:
        # retrain from scratch and overwrite any previously saved model
        model = gensim.models.Word2Vec(documents, sg=1, hs=1, iter=150, size=800, workers=1,
                                       sorted_vocab=1, alpha=0.325, min_count=1)
        model.init_sims(replace=False)   # can read, write and keep training -> more memory
        # model.init_sims(replace=True)  # read-only, no further training -> less memory
        model.save(word2vecpath)
        return True
    elif os.path.isfile(word2vecpath):
        model = gensim.models.Word2Vec.load(word2vecpath)
    else:
        model = gensim.models.Word2Vec(documents, sg=1, hs=1, iter=50, size=400, workers=1,
                                       sorted_vocab=1, alpha=0.325, min_count=1)
        model.init_sims(replace=False)   # can read, write and keep training -> more memory
        # model.init_sims(replace=True)  # read-only, no further training -> less memory
        model.save(word2vecpath)

    # stopword-filtered tokens of the generated sentence (currently unused)
    docs = [markov_sentence]
    with codecs.open(stopword, 'r', encoding='utf-8') as f:
        stopwords_tmp = f.read()
    stopwords = set(stopwords_tmp.strip().split())
    texts = [[word for word in document.lower().split() if word not in stopwords]
             for document in docs]

    '''
    print("Markov Score:")
    calc = model.score([markov_sentence.split()])
    print(calc)
    print("Standard Sentence Score:")
    standardSentence = u"Instagram Community trifft sich in Berlin und Frankfurt am Main - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."
    calc = model.score([standardSentence])
    print(calc)
    print("Direct Corpus Sentence Score:")
    sce = u"Instagram aehnelt in vielerlei Hinsicht Twitter, unter anderem weil es wie der Kurznachrichtendienst die Moeglichkeit bietet, in Echtzeit mit der Community zu kommunizieren."
    calc = model.score([sce])
    print(calc)
    '''

    # model.score() expects a list of tokenized sentences and only works with hs=1
    print("Markov Text Score 1:", model.score([markov_sentence.split()]))
    print("Documents Score 2:", model.score([documents.split()]))

    # n_similarity() expects two lists of words
    calc = model.n_similarity(documents.split(), markov_sentence.split())
    print("Markov Sentence Similarity Score:", calc)

    '''
    standardSentence = u"Instagram Community trifft sich in Berlin und Frankfurt am Main - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."
    calc = model.n_similarity(documents, standardSentence)
    print("Standard Sentence Similarity Score:")
    print(calc)
    # sce = u"Instagram aehnelt in vielerlei Hinsicht Twitter, unter anderem weil es wie der Kurznachrichtendienst die Moeglichkeit bietet, in Echtzeit mit der Community zu kommunizieren."
    sce = u"Biologisch betrachtet ist die DNA essentiell."
    calc = model.n_similarity(documents, sce)
    print("Direct Corpus Sentence Similarity Score:")
    print(calc)
    '''

    # wmdistance() also expects token lists; a distance of 0.0 means identical documents
    distance = model.wmdistance(documents.split(), markov_sentence.split())
    print("Distance wmd Score:", distance)
    return True
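
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original script): shows how the three
# similarity helpers would be called on a corpus text and a generated Markov
# sentence. The sample strings are taken from the commented-out experiments
# above and stand in for real inputs; the stopword file and the hard-coded
# model paths must exist for these calls to succeed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_corpus = u"Instagram aehnelt in vielerlei Hinsicht Twitter, unter anderem weil es wie der Kurznachrichtendienst die Moeglichkeit bietet, in Echtzeit mit der Community zu kommunizieren."
    sample_markov = u"Instagram Community trifft sich in Berlin und Frankfurt am Main - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."

    # TF-IDF cosine similarity between corpus text and generated sentence
    CalcCosineSimilarity(sample_corpus, sample_markov)

    # LSI topic-space score; builds and saves dictionary/corpus/model on first run
    print("CalcDocSimilarity:", CalcDocSimilarity(sample_corpus, sample_markov))

    # Word2Vec-based scores; train=True retrains the model from scratch (slow),
    # train=False reuses the model saved at word2vecpath
    CalcWordSimilarity(sample_corpus, sample_markov, train=False)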