#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
import time
import os, sys
import logging
import codecs
from collections import defaultdict
from pprint import pprint  # pretty-printer

import nltk
import gensim
# https://radimrehurek.com/gensim/apiref.html
# pip install pyemd
from gensim import corpora, models, similarities
from gensim.models import word2vec
from gensim.models import doc2vec

doc2vecpath = "/home/100biere/demo/Doc2Vec.mod"
doc2vecpathDict = "/home/100biere/demo/Doc2Vec.dict"
doc2vecpathCorp = "/home/100biere/demo/Doc2Vec.corp"
stopword = "/home/100biere/demo/stopwordlist.de.txt"


def CalcDocSimilarity(documents, markov_sentence):
    if os.path.isfile(doc2vecpath):
        # A trained model already exists on disk: reuse dictionary, corpus and LSI model.
        dictionary = corpora.Dictionary.load(doc2vecpathDict)
        #corpus = gensim.corpora.mmcorpus.MmCorpus.load(doc2vecpathCorp)
        #gensim.corpora.mmcorpus.MmCorpus.serialize(doc2vecpathCorp)
        corpus = gensim.corpora.mmcorpus.MmCorpus(doc2vecpathCorp)
        lsi = models.LsiModel.load(doc2vecpath)
        #lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=30, chunksize=100000)
        #index = similarities.MatrixSimilarity(lsi[corpus], num_features=len(corpus))

        vec_bow1 = dictionary.doc2bow(markov_sentence.lower().split())
        vec_lsi1 = lsi[vec_bow1]
        #sims1 = index[vec_lsi1]  # perform a similarity query against the corpus
        #print("Markov:")
        #print(vec_lsi1[0][1])
        if vec_lsi1 and vec_lsi1[0][1]:
            return vec_lsi1[0][1]
        else:
            return 0.0
    else:
        # documents is passed in as a plain-text string
        docs = [documents]

        with codecs.open(stopword, 'r', encoding='utf-8') as f:
            stopwords_tmp = f.read()
        stopwords = set(stopwords_tmp.strip().split())

        texts = [[word for word in document.lower().split() if word not in stopwords]
                 for document in docs]

        # keep only tokens that occur more than once
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > 1] for text in texts]

        dictionary = corpora.Dictionary(texts)
        dictionary.save(doc2vecpathDict)
        corpus = [dictionary.doc2bow(text) for text in texts]
        gensim.corpora.mmcorpus.MmCorpus.serialize(doc2vecpathCorp, corpus)
        #gensim.corpora.mmcorpus.MmCorpus.save(doc2vecpathCorp, corpus)

        #lda = gensim.models.LdaModel(corpus, id2word=dictionary, alpha='auto', num_topics=30, chunksize=100000, passes=2)
        lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=30, chunksize=100000)
        #index = similarities.MatrixSimilarity(lsi[corpus], num_features=len(corpus))  # transform corpus to LSI space and index it
        lsi.save(doc2vecpath)  # note: despite the name, this path stores the LSI model

        vec_bow1 = dictionary.doc2bow(markov_sentence.lower().split())
        vec_lsi1 = lsi[vec_bow1]
        #sims1 = index[vec_lsi1]  # perform a similarity query against the corpus
        if vec_lsi1 and vec_lsi1[0][1]:
            return vec_lsi1[0][1]
        else:
            return 0.0


def CalcDocSimilareeeeity(documents):
    # experimental Doc2Vec/LDA variant, kept for reference
    raw_sentences = []
    #tokens = nltk.word_tokenize(text)
    text_sent = nltk.sent_tokenize(documents)
    for a in text_sent:
        words = nltk.word_tokenize(a)
        #pprint(words)
        #print(type(a))
        raw_sentences.append(words)
    sentences = [gensim.models.doc2vec.TaggedDocument(words, [i])
                 for i, words in enumerate(raw_sentences)]

    if os.path.isfile(doc2vecpath):
        # reuse a previously trained Doc2Vec model (path shared with the LSI model above)
        model = gensim.models.Doc2Vec.load(doc2vecpath)
    else:
        model = gensim.models.Doc2Vec(iter=30, size=400, workers=4, sorted_vocab=1,
                                      alpha=0.75, min_alpha=0.45)  # min_count=1
        model.sort_vocab()
        model.build_vocab(sentences)
        #for epoch in range(30):
        #    model.train(sentences)
        model.init_sims(replace=False)   # can read, write and keep training -> more memory
        #model.init_sims(replace=True)   # read-only, no further training -> less memory
        model.save(doc2vecpath)

    # vocabulary entries in the Doc2Vec model
    ###wordVocab = [k for (k, v) in model.vocab.iteritems()]
    #pprint(wordVocab)
    #sys.exit(0)
    #try:
    #sim = model.most_similar(positive=["Instagram"], negative=[], topn=10, restrict_vocab=None, indexer=None)
    ####sim = model.most_similar_cosmul(positive=[standardSentence], negative=[], topn=10)
    #sim = model.score(standardSentence, total_sentences=100, chunksize=100, queue_factor=2, report_delay=1)
    #sim_word = sim[0]
    #sim_calc = sim_word[1]
    #distance = model.wmdistance(text_markov, standardSentence)
    # for every sentence in the corpus, run the check against the Markov-generated sentence
    #model = word2vec.Word2Vec(text, size=200)

    stopword = "/home/100biere/demo/stopwordlist.de.txt"
    with codecs.open(stopword, 'r', encoding='utf-8') as f:
        stopwords_tmp = f.read()
    stopwords = set(stopwords_tmp.strip().split())

    # treat the whole input string as a single document
    #texts = [[word for word in document.lower().split() if word not in stopwords] for document in documents]
    texts = [[word for word in documents.lower().split() if word not in stopwords]]

    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts_final = [[token for token in text if frequency[token] > 1] for text in texts]

    dictionary = corpora.Dictionary(texts_final)
    dictionary.save('/home/100biere/demo/demo.dict')
    corpus = [dictionary.doc2bow(text) for text in texts_final]

    lda = gensim.models.LdaMulticore(corpus, num_topics=25, chunksize=1000, passes=2)  # train asymmetric alpha from data
    #lda.bound(corpus)

    standardSentence = u"Instagram - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."
    text_markov = u"Ich schaue Instagram und gucken, ob das video abgespielt wird"
    vec_bow1 = dictionary.doc2bow(standardSentence.lower().split())
    vec_bow2 = dictionary.doc2bow(text_markov.lower().split())
    #print(lda[vec_bow1])  # get topic probability distribution for a document
    #print(lda[vec_bow2])
    #v = lda.log_perplexity(vec_bow1, total_docs=10)
    #print(lda.show_topics(num_topics=10, num_words=10, log=False, formatted=True))

    print('Perplexity Markov: ', end='')
    perplex1 = lda.bound([vec_bow2])  # bound() expects a corpus, i.e. a list of BoW documents
    print(perplex1)
    print(lda[vec_bow1])

    print('Perplexity Standard: ', end='')
    print(lda[vec_bow2])
    perplex2 = lda.bound([vec_bow1])
    print(perplex2)
    ###lda.update(corpus2)  # update the LDA model with additional documents
    #print(lda[text_markov])  # text_markov, standardSentence
    #print("Similarity:")
    #print(sim)
    #print(model["Instagram"])
    #except KeyError:
    #    1
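# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original workflow).
# It reuses the two German test sentences from CalcDocSimilareeeeity above and
# assumes the stop-word list under /home/100biere/demo/ exists. On the first
# call CalcDocSimilarity builds and saves the dictionary, MmCorpus and LSI
# model; later calls load them from disk. The returned value is the weight of
# the first LSI topic for the Markov sentence, or 0.0 if no known word matched.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_document = u"Instagram - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."
    markov_sentence = u"Ich schaue Instagram und gucken, ob das video abgespielt wird"

    score = CalcDocSimilarity(sample_document, markov_sentence)
    print("LSI weight for the Markov sentence:", score)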