#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
#
# Model setup:
#   python -m spacy.en.download
#   python -m spacy.de.download
#
# References:
#   https://spacy.io/docs/#tutorials
#   https://www.w3.org/services/html2txt
#   CSS: http://codepen.io/explosion/pen/xEpgKz
#   CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer
#
# Grammar checkers:
#   pip install --upgrade 3to2
#   pip install --upgrade language-check
#   https://github.com/lpenz/atdtool/blob/master/atdtool/__init__.py
#   http://stackoverflow.com/questions/10252448/how-to-check-whether-a-sentence-is-correct-simple-grammar-check-in-python
#     -> https://pypi.python.org/pypi/grammar-check
#        https://pypi.python.org/pypi/language-check
#   Statistical spell- and (occasional) grammar-checker:
#     http://lindat.mff.cuni.cz/services/korektor
#     https://github.com/ufal/korektor
#   /home/100biere/demo/tensorflow/models/syntaxnet
#   https://wiki.python.org/moin/LanguageParsing
#   http://nlp.stanford.edu/software/lex-parser.shtml
#   https://github.com/jsvine/markovify
#   Spell correction: http://norvig.com/spell-correct.html
#   Grammar correction: http://www.abisource.com/projects/link-grammar/#download
#   Python-based grammar check based on learning: https://www.openhub.net/p/grac
#   http://www.decontextualize.com/teaching/rwet/n-grams-and-markov-chains/

import time

start_time = time.time()

import os
os.system('clear')

# http://polyglot.readthedocs.io/en/latest/Installation.html
import NP  # local noun-phrase tokenizer module
import polyglot
from polyglot.text import Text, Word
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import gensim, logging
import markovify
from langdetect import detect
import spacy
import os.path
import pprint
import codecs
import re
import string
import sys
from textblob_de import TextBlobDE
from unidecode import unidecode


def remove_non_ascii(text):
    return unidecode(text)
    # return unidecode(unicode(text, encoding="utf-8"))


def list_find(what, where):
    """Find the `what` list inside the `where` list.

    Return the index in `where` at which `what` starts,
    or -1 if there is no such index.
    """
    if not what:  # an empty list is always found
        return 0
    try:
        index = 0
        while True:
            index = where.index(what[0], index)
            if where[index:index + len(what)] == what:
                return index  # found
            index += 1  # try the next position
    except ValueError:  # where.index() ran out of matches
        return -1  # not found
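
# Minimal usage sketch for list_find() (illustrative values, not taken from
# the corpus): the helper works on any plain Python lists.
assert list_find([2, 3], [1, 2, 3, 4]) == 1  # sublist starts at index 1
assert list_find([9], [1, 2, 3]) == -1       # not present
assert list_find([], [1, 2, 3]) == 0         # empty list is always found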
""" if not what: # empty list is always found return 0 try: index = 0 while True: index = where.index(what[0], index) if where[index:index+len(what)] == what: return index # found index += 1 # try next position except ValueError: return -1 # not found def contains(what, where): """Return [start, end+1] if found else empty list.""" i = list_find(what, where) return [i, i + len(what)] if i >= 0 else [] #NOTE: bool([]) == False def is_sublist(a, b): if a == []: return True if b == []: return False return b[:len(a)] == a or is_sublist(a, b[1:]) def createRuleBasedSamples(input, ccount): posTagMarkov = [] toks1 = nlp(input) for token1 in toks1: posTagMarkov.append(token1.pos_) #boolVal = contains(allowedRule, posTagMarkov) boolVal = is_sublist(allowedRule, posTagMarkov) if boolVal: print(ccount), print(" -> Good Markov Result (Rule-Based-Grammar):") print(input) print("\n") with codecs.open("good_rulebased_markov.log",'a',encoding='utf-8') as f: f.write("count: -> ") f.write('% 6.2f' % ccount) f.write(" -> ") f.write("triesMarkov: ") f.write('% 6.2f' % triesMarkov) f.write(" ---> max_overlap_ratioMarkov: ") f.write('% 6.2f' % max_overlap_ratioMarkov) f.write(" ---> max_overlap_totalMarkov: ") f.write('% 6.2f' % max_overlap_totalMarkov) f.write(" ---> ####### -> ") f.write(input) f.write("\n") f.close() #time.sleep(3) else: print(ccount), print(" -> No Markov Result (Rule-Based-Grammar)") return True def createNounPhraseSamples(input, ccount): iNP = NP.tokenize(input, nlp) #iNP = ["fotos", "kamera"] goodFlag = False #flache liste von iNP -> immer nur ein Element, und da stecken dann nPList = nounPhrases[0] tmpList = iNP[0] c = 0 for ele1 in nPList: for ele2 in tmpList: #c = c + 1 #print(c), #print(ele2) if ele1 == ele2 and c < len(nPList): c = c + 1 elif ele1 == ele2 and c == len(nPList): c = c + 1 # print("Noun Phrases (Markov):"), # print(ele1) goodFlag = True break ''' for ndx1, member1 in enumerate(nounPhrases): for ndx2, member2 in enumerate(iNP): print(iNP[ndx2]) if member1 == member2: print("Noun Phrases (Markov):"), print(iNP[ndx2]) goodFlag = True break ''' ''' a = [] b = [] for ndx, member in enumerate(iNP): c = tuple(member) a.append(c) for ndx, member in enumerate(nounPhrases): c = tuple(member) b.append(c) a = set(a) b = set(b) print("Sets A:"), print(a) print("Sets B:"), print(b) if "videos" in a: print("wort videos in SET A enthalten") if "videos" in b: print("wort videos in SET B enthalten") e = list(a & b) d = b.intersection(a) # .containsAll([2, 1]) print("\n") print("Noun Phrases (Orginal):"), print(b)#print(nounPhrases) print("\n") print("Noun Phrases (Markov):"), print(a)#print(iNP) print("\n") print("Lamda Intersect:"), #print(d) print(e) print("\n") #time.sleep(1.411) sys.exit(0) #if len(list3)>1: # sys.exit(0) ''' if goodFlag: print(ccount), print(" -> Good Markov Result (Noun Phrase):") print(input) print("\n") with codecs.open("good_nounphrase_markov.log",'a',encoding='utf-8') as f: f.write("count: -> ") f.write('% 6.2f' % ccount) f.write(" -> ") f.write("triesMarkov: ") f.write('% 6.2f' % triesMarkov) f.write(" ---> max_overlap_ratioMarkov: ") f.write('% 6.2f' % max_overlap_ratioMarkov) f.write(" ---> max_overlap_totalMarkov: ") f.write('% 6.2f' % max_overlap_totalMarkov) f.write(" ---> ####### -> ") f.write(input) f.write("\n") f.close() #time.sleep(3) else: 1 #print(ccount), #print(" -> No Markov Result (Noun Phrase)") return True nlp = spacy.de.German() nounPhrases = [] nounPhrMarkov = [] posTagMarkov = [] ####allowedRule = 
allowedRule = [u'DET', u'ADJ', u'NOUN']

stopword = "/home/100biere/demo/stopwordlist.de.txt"
filename = "/home/100biere/demo/instagram_ohneumlauts.txt"
writename = "/home/100biere/demo/output.txt"
doc2vecpath = "/home/100biere/demo/Doc2Vec.bin"

# German seed sentence ("Instagram - today we explain how you can get more
# followers and share more photos and videos."); kept in German because the
# whole pipeline runs on German text.
standardSentence = u"Instagram - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."

triesMarkov = 25
max_overlap_ratioMarkov = 0.75
max_overlap_totalMarkov = 15

# Get the raw text as a string.
with codecs.open(filename, 'r', encoding='utf-8') as f:
    text = f.read()

text = remove_non_ascii(text)
wordList = text.split(' ')
nounPhrases = NP.tokenize(standardSentence, nlp)

'''
doc = nlp(standardSentence)
for np in doc.noun_chunks:
    print(np.root.tag_, np.text, np.root.ent_type_)
sys.exit(0)

### nounPhrases = NP.tokenize(text)
### print("Noun Phrases:", nounPhrases)

print("NER Phrases:")
nameEntity = Text(text.encode())
# print(nameEntity)
# sys.exit(0)
ndEntity = []
for entity in nameEntity.entities:
    if entity[0][0].isupper() and entity not in ndEntity:  # only uppercase allowed
        print(entity.tag, entity)
        print()
        ndEntity.append(entity)
'''

# Build the Markov model.
text_model = markovify.Text(text, state_size=2)

count = 0
# NOTE: this loop never terminates on its own; stop it with Ctrl-C.
# The runtime report below is only reached if the loop is changed to
# break at some point.
while 1:
    count = count + 1
    ### original: markov_sample = text_model.make_sentence(tries=25, max_overlap_ratio=0.25, max_overlap_total=7)
    markov_sample = text_model.make_sentence(tries=triesMarkov,
                                             max_overlap_ratio=max_overlap_ratioMarkov,
                                             max_overlap_total=max_overlap_totalMarkov)
    if markov_sample is not None and len(markov_sample) > 30:
        # createRuleBasedSamples(markov_sample, count)
        createNounPhraseSamples(markov_sample, count)
        with codecs.open(writename, 'w', encoding='utf-8') as f:
            f.write(markov_sample)

ft = (time.time() - start_time)
print("Script Runtime: ---", ft, "---- Seconds")
sys.exit(0)
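
# Notes on the markovify parameters used above (summarized from
# https://github.com/jsvine/markovify; treat this as a sketch of the API,
# not its exact internals):
#   tries             - how many candidate sentences make_sentence() may
#                       generate before giving up and returning None;
#   max_overlap_ratio - reject candidates that reuse contiguous runs of
#                       words from the corpus longer than this fraction
#                       of the sentence;
#   max_overlap_total - ... or longer than this absolute word count.
# The looser values used here (0.75 / 15) accept sentences that stay close
# to the corpus; the stricter commented-out originals (0.25 / 7) force more
# novel output but yield fewer sentences.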