#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
import time
import os, sys
import logging
import codecs
from collections import defaultdict
from pprint import pprint  # pretty-printer

import gensim
import nltk
from gensim import corpora, models, similarities
from gensim.models import word2vec
from gensim.models import doc2vec
# https://radimrehurek.com/gensim/apiref.html
# pip install pyemd  (needed for wmdistance)

word2vecpath = "/home/100biere/demo/realdemo/Word2Vec.mod"
doc2vecpath = "/home/100biere/demo/realdemo/Doc2Vec.mod"
doc2vecpathDict = "/home/100biere/demo/realdemo/Doc2Vec.dict"
doc2vecpathCorp = "/home/100biere/demo/realdemo/Doc2Vec.corp"
stopword = "/home/100biere/demo/realdemo/stopwordlist.de.txt"


def CalcWordSimilarity(documents, markov_sentence, train=True):
    # https://radimrehurek.com/gensim/models/phrases.html#module-gensim.models.phrases
    # Gensim Phrases would model bi- and trigrams.
    #
    # n_similarity(ws1, ws2)
    # score(sentences, total_sentences=1000000, chunksize=100, queue_factor=2, report_delay=1)
    #   --> https://radimrehurek.com/gensim/models/word2vec.html
    if train:
        # Training always rebuilds the model from `documents`; an existing file on disk
        # is simply overwritten (the original loaded it first and then discarded it).
        model = gensim.models.Word2Vec(documents, sg=1, hs=1, iter=150, size=800, workers=1,
                                       sorted_vocab=1, alpha=0.325, min_count=1)
        model.init_sims(replace=False)   # keeps raw vectors: model stays trainable -> more memory
        #model.init_sims(replace=True)   # read-only, no further training -> less memory
        model.save(word2vecpath)
        return True
    elif os.path.isfile(word2vecpath):
        model = gensim.models.Word2Vec.load(word2vecpath)
    else:
        model = gensim.models.Word2Vec(documents, sg=1, hs=1, iter=50, size=400, workers=1,
                                       sorted_vocab=1, alpha=0.325, min_count=1)
        model.init_sims(replace=False)   # keeps raw vectors: model stays trainable -> more memory
        #model.init_sims(replace=True)   # read-only, no further training -> less memory
        model.save(word2vecpath)

    #markov = [markov_sentence]
    docs = [markov_sentence]
    with codecs.open(stopword, 'r', encoding='utf-8') as f:
        stopwords_tmp = f.read()
    stopwords = set(stopwords_tmp.strip().split())
    # texts is kept for parity with the other functions but is not used further here.
    texts = [[word for word in document.lower().split() if word not in stopwords]
             for document in docs]

    '''
    print("Markov Score:"),
    calc = model.score([markov_sentence.split()])
    print(calc)
    # documents
    print("Markov Score:"),
    calc = model.score([markov_sentence])
    print(calc)

    print("Standard Sentence Score:"),
    standardSentence = u"Instagram Community trifft sich in Berlin und Frankfurt am Main - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."
    calc = model.score([standardSentence])
    #calc = similarity
    print(calc)

    print("Direkt Corpus Sentence Score:"),
    sce = u"Instagram aehnelt in vielerlei Hinsicht Twitter, unter anderem weil es wie der Kurznachrichtendienst die Moeglichkeit bietet, in Echtzeit mit der Community zu kommunizieren."
    calc = model.score([sce])
    #print("Syntax Calc:"),
    print(calc)
    '''

    # n_similarity() and wmdistance() expect two lists of words. `documents` is assumed
    # to be the same iterable of token lists that was used for training, so it is
    # flattened here; the query sentences are tokenized with a simple lower()/split().
    doc_words = [word for doc in documents for word in doc]
    markov_words = markov_sentence.lower().split()

    calc = model.n_similarity(doc_words, markov_words)
    print("Markov Sentence Similarity Score:", calc)

    standardSentence = u"Instagram Community trifft sich in Berlin und Frankfurt am Main - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."
    calc = model.n_similarity(doc_words, standardSentence.lower().split())
    print("Standard Sentence Similarity Score:", calc)

    #sce = u"Instagram aehnelt in vielerlei Hinsicht Twitter, unter anderem weil es wie der Kurznachrichtendienst die Moeglichkeit bietet, in Echtzeit mit der Community zu kommunizieren."
    sce = u"Biologisch betrachtet ist die DNA essentiell."
    calc = model.n_similarity(doc_words, sce.lower().split())
    print("Direct Corpus Sentence Similarity Score:", calc)

    # A wmdistance score of 0.0 means the two documents are identical.
    distance = model.wmdistance(doc_words, u"Kuehlschraenke sind Kuechengeraete und werden taeglich benoetigt.".lower().split())
    print("Distance wmd Score:", distance)

    return True
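# ---------------------------------------------------------------------------
# Hedged helper sketch (an addition, not part of the original flow):
# n_similarity() raises a KeyError as soon as a query contains a word that is
# missing from the trained vocabulary, and wmdistance() silently drops such
# words. The helper below shows one way to filter a token list against the
# vocabulary first. It assumes a pre-4.0 gensim where the vocabulary is exposed
# as model.wv.vocab (very old releases used model.vocab); the helper name itself
# is hypothetical and nothing above calls it.
def FilterToVocabulary(model, words):
    vocab = getattr(getattr(model, "wv", model), "vocab", {})
    return [word for word in words if word in vocab]
# Example: FilterToVocabulary(model, u"mehr Follower und Videos".lower().split())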
def CalcDocSimilarity(documents, markov_sentence):
    if os.path.isfile(doc2vecpath):
        # A serialized LSI model, dictionary and corpus already exist on disk.
        dictionary = corpora.Dictionary.load(doc2vecpathDict)
        #corpus = gensim.corpora.mmcorpus.MmCorpus.load(doc2vecpathCorp)
        corpus = gensim.corpora.mmcorpus.MmCorpus(doc2vecpathCorp)
        lsi = models.LsiModel.load(doc2vecpath)
        #lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=30, chunksize=100000)
        #index = similarities.MatrixSimilarity(lsi[corpus], num_features=len(corpus))

        vec_bow1 = dictionary.doc2bow(markov_sentence.lower().split(), allow_update=False, return_missing=False)
        vec_lsi1 = lsi[vec_bow1]
        #sims1 = index[vec_lsi1]  # perform a similarity query against the corpus
        #print("sim1 value:"),
        #pprint(index)
        #print(sims1)
        try:
            if vec_lsi1[0][1]:
                return vec_lsi1[0][1]
            else:
                return 0.0
        except Exception:
            return 0.0
    else:
        # documents is passed in as a plain-text string here.
        docs = [documents]
        with codecs.open(stopword, 'r', encoding='utf-8') as f:
            stopwords_tmp = f.read()
        stopwords = set(stopwords_tmp.strip().split())
        texts = [[word for word in document.lower().split() if word not in stopwords]
                 for document in docs]

        # Keep only tokens that occur more than once.
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > 1] for text in texts]

        dictionary = corpora.Dictionary(texts)
        dictionary.save(doc2vecpathDict)
        corpus = [dictionary.doc2bow(text) for text in texts]
        gensim.corpora.mmcorpus.MmCorpus.serialize(doc2vecpathCorp, corpus)

        #lda = gensim.models.LdaModel(corpus, id2word=dictionary, alpha='auto', num_topics=30, chunksize=100000, passes=2)
        lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=30, chunksize=100000)
        #index = similarities.MatrixSimilarity(lsi[corpus], num_features=len(corpus))  # transform corpus to LSI space and index it
        lsi.save(doc2vecpath)

        vec_bow1 = dictionary.doc2bow(markov_sentence.lower().split())
        vec_lsi1 = lsi[vec_bow1]
        #sims1 = index[vec_lsi1]  # perform a similarity query against the corpus
        try:
            if vec_lsi1[0][1]:
                return vec_lsi1[0][1]
            else:
                return 0.0
        except Exception:
            return 0.0
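# ---------------------------------------------------------------------------
# Hedged sketch of the similarity query that CalcDocSimilarity() leaves
# commented out: instead of returning the first LSI coordinate of the query,
# a MatrixSimilarity index would give a cosine similarity against every corpus
# document. The function name is hypothetical and it is not wired into the flow
# above; it assumes the dictionary, corpus and lsi objects built there.
def QueryLsiIndex(lsi, corpus, dictionary, sentence):
    index = similarities.MatrixSimilarity(lsi[corpus], num_features=lsi.num_topics)
    vec_lsi = lsi[dictionary.doc2bow(sentence.lower().split())]
    sims = index[vec_lsi]  # one cosine similarity per corpus document
    return float(max(sims)) if len(sims) else 0.0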
def CalcDocSimilareeeeity(documents):
    # documents is a plain-text string; split it into tokenized sentences first.
    raw_sentences = []
    #tokens = nltk.word_tokenize(text)
    text_sent = nltk.sent_tokenize(documents)
    for a in text_sent:
        words = nltk.word_tokenize(a)
        #pprint(words)
        #print(type(a))
        raw_sentences.append(words)
    sentences = [gensim.models.doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(raw_sentences)]

    if os.path.isfile(doc2vecpath):
        model = gensim.models.Doc2Vec.load(doc2vecpath)
    else:
        model = gensim.models.Doc2Vec(iter=30, size=400, workers=4, sorted_vocab=1, alpha=0.75, min_alpha=0.45)  # min_count=1
        model.sort_vocab()
        model.build_vocab(sentences)
        # Note: the actual training pass stayed disabled in the original script.
        #for epoch in range(30):
        #    model.train(sentences)
        model.init_sims(replace=False)   # keeps raw vectors: model stays trainable -> more memory
        #model.init_sims(replace=True)   # read-only, no further training -> less memory
        model.save(doc2vecpath)

    # Vocabulary of the Doc2Vec model
    ###wordVocab = [k for (k, v) in model.vocab.iteritems()]
    #pprint(wordVocab)
    #sys.exit(0)
    #try:
    #sim = model.most_similar(positive=["Instagram"], negative=[], topn=10, restrict_vocab=None, indexer=None)
    ####sim = model.most_similar_cosmul(positive=[standardSentence], negative=[], topn=10)
    #sim = model.score(standardSentence, total_sentences=100, chunksize=100, queue_factor=2, report_delay=1)
    #sim_word = sim[0]
    #sim_calc = sim_word[1]
    #distance = model.wmdistance(text_markov, standardSentence)
    # For every sentence in the corpus, run the check against the Markov-generated sentence.
    #model = word2vec.Word2Vec(text, size=200)

    stopword = "/home/100biere/demo/realdemo/stopwordlist.de.txt"
    with codecs.open(stopword, 'r', encoding='utf-8') as f:
        stopwords_tmp = f.read()
    stopwords = set(stopwords_tmp.strip().split())

    # Build one stopword-filtered token list per sentence (the original flattened
    # everything into a single word list, which made the frequency filter below
    # operate on single characters).
    texts = [[word for word in sent.lower().split() if word not in stopwords] for sent in text_sent]
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts_final = [[token for token in text if frequency[token] > 1] for text in texts]

    dictionary = corpora.Dictionary(texts_final)
    dictionary.save('/home/100biere/demo/realdemo/demo.dict')
    corpus = [dictionary.doc2bow(text) for text in texts_final]

    lda = gensim.models.LdaMulticore(corpus, id2word=dictionary, num_topics=25, chunksize=1000, passes=2)
    #lda.bound(corpus)

    standardSentence = u"Instagram - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."
    text_markov = u"Ich schaue Instagram und gucken, ob das video abgespielt wird"
    vec_bow1 = dictionary.doc2bow(standardSentence.lower().split())
    vec_bow2 = dictionary.doc2bow(text_markov.lower().split())
    #print(lda[vec_bow1])  # get topic probability distribution for a document
    #print(lda[vec_bow2])
    #v = lda.log_perplexity(vec_bow1, total_docs=10)
    #print(lda.show_topics(num_topics=10, num_words=10, log=False, formatted=True))

    # lda.bound() expects a corpus (an iterable of documents), so each bag-of-words
    # vector is wrapped in a list.
    perplex1 = lda.bound([vec_bow2])
    print('Perplexity Markov:', perplex1)
    print('Topics Markov:', lda[vec_bow2])
    perplex2 = lda.bound([vec_bow1])
    print('Perplexity Standard:', perplex2)
    print('Topics Standard:', lda[vec_bow1])

    ###lda.update(corpus2)  # update the LDA model with additional documents
    #print(lda[text_markov])  # text_markov, standardSentence
    #print("Similarity:"),
    #print(sim)
    #print(model["Instagram"])
    #except KeyError:
    #    pass
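# ---------------------------------------------------------------------------
# Hedged usage sketch: the original file defines no entry point, so the call
# pattern below is an assumption. corpuspath is a hypothetical location for a
# German plain-text corpus (one sentence per line); the hardcoded model paths
# above must also exist so the functions can save their output.
if __name__ == "__main__":
    logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

    corpuspath = "/home/100biere/demo/realdemo/corpus.de.txt"
    with codecs.open(corpuspath, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    # CalcWordSimilarity expects an iterable of token lists, CalcDocSimilarity a plain string.
    documents = [line.lower().split() for line in raw_text.splitlines() if line.strip()]
    markov_sentence = u"Instagram Community teilt mehr Fotos und Videos mit den Followern"

    CalcWordSimilarity(documents, markov_sentence, train=True)
    print("LSI score:", CalcDocSimilarity(raw_text, markov_sentence))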