#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
# python -m spacy.en.download
# python -m spacy.de.download
# https://spacy.io/docs/#tutorials
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer

import time
# pip install --upgrade 3to2
# pip install --upgrade language-check
start_time = time.time()

# http://polyglot.readthedocs.io/en/latest/Installation.html
import polyglot
from polyglot.text import Text, Word
import rake
import NP
import itertools
#import language_check
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import gensim, logging
import markovify
from langdetect import detect
import spacy
from spacy.de import German
import base64
import os.path
import json
import pprint
import codecs
import nltk
import re
import string
from subprocess import call
import sys
from textblob_de import TextBlobDE


def only_letters(s):
    # Return 1 if s contains only letters, hyphens, digits and whitespace, else 0.
    # (Parameter renamed from `string`, which shadowed the stdlib module.)
    #s = re.sub('[^0-9a-zA-Z]+', '*', s)
    non_matching = re.findall(r'[^A-Za-z-\s\d]', s)
    if len(non_matching) > 0:
        return 0
    return 1

#de_nlp = spacy.load('de', tagger=True, parser=True, entity=True)
#de_nlp = spacy.de.German()

nounPhrases = []
nounPhrMarkov = []
stopword = "/home/100biere/demo/stopwordlist.de.txt"
filename = "/home/100biere/demo/input.txt"
writename = "/home/100biere/demo/output.txt"
doc2vecpath = "/home/100biere/demo/Doc2Vec.bin"
#tool = language_check.LanguageTool('de_DE')

# https://github.com/zelandiya/RAKE-tutorial/blob/master/rake_tutorial.py
# Rake(stopword_file, min_chars_per_word, max_words_per_phrase, min_keyword_frequency)
rake_object1 = rake.Rake(stopword, 3, 1, 1)
rake_object2 = rake.Rake(stopword, 3, 2, 1)
rake_object3 = rake.Rake(stopword, 3, 3, 1)
rake_object4 = rake.Rake(stopword, 3, 4, 1)
rake_object5 = rake.Rake(stopword, 3, 5, 1)
'''
rake_object = rake.Rake("SmartStoplist.txt", 5, 3, 4)
Now we have a RAKE object that extracts keywords where:
 - each word has at least 5 characters,
 - each phrase has at most 3 words,
 - each keyword appears in the text at least 4 times.
'''

# Read the raw text as a string; the `with` block closes the file itself.
with codecs.open(filename, 'r', encoding='utf8') as f:
    text = f.read()

# Build the Markov model.
text_model = markovify.Text(text)

# Print five randomly-generated sentences
#for i in range(5):
#    print(text_model.make_sentence())

# http://pythonhosted.org/pyenchant/faq.html
# Keep sampling until markovify produces a sentence longer than 30 characters.
while True:
    markov_sample = text_model.make_sentence()
    if markov_sample is not None and len(markov_sample) > 30:
        with codecs.open(writename, 'w', encoding='utf-8') as f:
            f.write(markov_sample)
        break

#with codecs.open(writename, 'r', encoding='latin-1') as f:
#    markov_sample = f.read()

with codecs.open(writename, 'r', encoding='utf8') as f:
    markov_sample = f.read()

print("Markov Sample Text:", markov_sample)
print()

keywordsRake1 = rake_object1.run(markov_sample)
keywordsRake2 = rake_object2.run(markov_sample)
keywordsRake3 = rake_object3.run(markov_sample)
keywordsRake4 = rake_object4.run(markov_sample)
keywordsRake5 = rake_object5.run(markov_sample)
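# --- Editor's sketch, not part of the original script ---
# Each rake_objectN.run() call above returns a list of (phrase, score) tuples
# (assuming the RAKE-tutorial implementation referenced earlier). A minimal
# sketch of merging the five result lists and keeping the highest-scoring
# unique phrases; `top_rake_phrases` is a hypothetical helper name.
def top_rake_phrases(*keyword_lists, limit=10):
    best = {}
    for phrase, score in itertools.chain(*keyword_lists):
        # keep the best score seen for each phrase
        if score > best.get(phrase, 0.0):
            best[phrase] = score
    return sorted(best.items(), key=lambda kv: kv[1], reverse=True)[:limit]

# Example usage (disabled so the original output is unchanged):
#pprint.pprint(top_rake_phrases(keywordsRake1, keywordsRake2, keywordsRake3,
#                               keywordsRake4, keywordsRake5))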
# 3. Print results.
print("Rake Keywords 1:", keywordsRake1)
print()
print("Rake Keywords 2:", keywordsRake2)
print()
print("Rake Keywords 3:", keywordsRake3)
print()
print("Rake Keywords 4:", keywordsRake4)
print()
print("Rake Keywords 5:", keywordsRake5)
print()

markovWordList = markov_sample.split(" ")
#markovWordListThree = map(lambda x: range(1, 3), map(str.decode("utf-8").split, [markov_sample]))

# Group the sampled sentence into chunks of three words.
c = []
ct = ''
count = 0
for i in markovWordList:
    ct = ct + i + " "
    count += 1
    if count == 3:
        c.append(ct)
        count = 0
        ct = ""
print("List of three Words:", c)
#sys.exit(0)  # debugging stop; disabled so the rest of the script runs

#keywordsMarkow = rake_object.run(markov_sample)
#print("RAKE Keywords Markov:", keywordsMarkow)

#blob = TextBlobDE(markov_sample)
#for tok in blob.tags:
#    print(tok)
#print("TextBlobDE Noun Phrases:", blob.noun_phrases)

nounPhrases = NP.tokenize(text)
nounPhrMarkov = NP.tokenize(markov_sample)
#nounPhrases = [x[0].encode('utf-8') for x in nounPhrases]
#nounPhrMarkov = [x[0].encode('utf-8') for x in nounPhrMarkov]
print("Markov Noun Phrases:", nounPhrMarkov)
print()
print("Original Noun Phrases:")
print(nounPhrases)
print()

blob = TextBlobDE(markov_sample)
toks1 = blob.tags
print("TextBlob POS Tagger:", toks1)
print()

# spacy.de.German() would overwrite the pipeline loaded via spacy.load(),
# so only one of the two is used.
nlp = spacy.load('de', tagger=True, parser=False, entity=False)
#nlp = spacy.de.German()
toks2 = nlp(markov_sample)
print("Spacy POS Tagger:")
print(toks2)

'''
print("LanguageTool Corrected Text: ")
# http://wiki.languagetool.org/command-line-options
os.system("java -jar /home/100biere/software/LanguageTool-3.5/languagetool-commandline.jar -adl -a /home/100biere/demo/output.txt")
print()
print()
'''

# Polyglot expects unicode text under Python 3, so the sample is passed unencoded.
nameEntity = Text(markov_sample)
for entity in nameEntity.entities:
    print(entity.tag, entity)
print()

keyword = "Tochter"
if any(x in nounPhrMarkov for x in nounPhrases):
    print("Noun Phrase hit: ")

if markov_sample in text:
    print("Version 1: the Markov sample text occurs in the source corpus.")
    print("\n")

list_mk = []
list_mk.append(markov_sample)
# Check each sampled sentence against the corpus (not each character of the
# corpus against the list, which the original generator expression did).
if any(x in text for x in list_mk):
    print("Version 2: the Markov sample text occurs in the source corpus.")
    print("\n")

if keyword in text:
    print(keyword + " -> occurs in the corpus")
    print("\n")

raw_sentences = []
#tokens = nltk.word_tokenize(text)
text_sent = nltk.sent_tokenize(text)
for a in text_sent:
    words = nltk.word_tokenize(a)
    raw_sentences.append(words)

sentences = [gensim.models.doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(raw_sentences)]

# Load a cached model if one exists; otherwise build, train and save a new one.
# (The original loaded the model and then unconditionally overwrote it.)
if os.path.isfile(doc2vecpath):
    model = gensim.models.Doc2Vec.load(doc2vecpath)
else:
    model = gensim.models.Doc2Vec(iter=50, size=400, workers=4, sorted_vocab=1,
                                  alpha=0.75, min_alpha=0.45)  # min_count=1
    #model.sort_vocab()  # redundant: sorted_vocab=1 already sorts during build_vocab
    model.build_vocab(sentences)
    for epoch in range(40):
        model.train(sentences)
    model.init_sims(replace=False)   # keeps the model trainable -> more memory
    #model.init_sims(replace=True)   # read-only, no more training -> less memory
    model.save(doc2vecpath)

# vocabulary of the Doc2Vec model
###wordVocab = [k for (k, v) in model.vocab.iteritems()]
#pprint.pprint(wordVocab)

try:
    sim = model.most_similar(positive=[keyword], negative=[], topn=5)
    sim_word = sim[0]
    sim_calc = sim_word[1]
    print("Similarity:", sim)
except KeyError:
    pass

ft = time.time() - start_time
print("Script Runtime: ---", ft, "---- Seconds")
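# --- Editor's sketch, not part of the original script ---
# A minimal sketch of querying the trained Doc2Vec model for corpus sentences
# similar to a new piece of text via infer_vector(); assumes the gensim 0.x/1.x
# API used above (docvecs.most_similar accepting a raw vector). The function
# name `similar_sentences` is hypothetical, and it is not called by default.
def similar_sentences(d2v_model, query_text, topn=3):
    query_words = nltk.word_tokenize(query_text)
    query_vec = d2v_model.infer_vector(query_words)
    # returns (sentence_tag, cosine_similarity) pairs
    return d2v_model.docvecs.most_similar([query_vec], topn=topn)

# Example usage (disabled so the runtime measurement above stays last):
#pprint.pprint(similar_sentences(model, markov_sample))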