#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
"""Generate Markov-chain sentence samples from an Instagram text corpus.

Builds a second-order markovify model over a German corpus, extracts noun
phrases and named entities from a reference sentence, then loops until one
acceptable Markov sentence (> 45 chars) is produced; that sample is fed to the
rule/NP/NER-based sample generators, written to disk, and the script exits.

Setup notes:
    python -m spacy.en.download
    python -m spacy.de.download

Reference links kept from the original header:
    https://spacy.io/docs/#tutorials
    https://explosion.ai/blog/displacy-ent-named-entity-visualizer
    Grammar checking:  https://pypi.python.org/pypi/language-check
                       http://lindat.mff.cuni.cz/services/korektor
    Markov chains:     https://github.com/jsvine/markovify
    Spell correction:  http://norvig.com/spell-correct.html
    Link grammar:      http://www.abisource.com/projects/link-grammar/#download
"""
import time

start_time = time.time()

import os
os.system('clear')

# Project-local modules (NP tokenizer and the rule-based sample generators).
import NP
import RuleBasedSamples
# import GensimCalcSimilarity
from RuleBasedSamples import *
import polyglot
from polyglot.text import Text, Word
import markovify
from langdetect import detect
import spacy
import os.path
import pprint
import codecs
import re
import numpy
import string
import sys
from textblob_de import TextBlobDE
from unidecode import unidecode


def remove_non_ascii(text):
    """Transliterate any non-ASCII characters to their closest ASCII match."""
    return unidecode(text)
    # return unidecode(unicode(text, encoding = "utf-8"))


def is_sublist(a, b):
    """Return True if list *a* occurs as a contiguous sublist of *b*.

    An empty *a* is a sublist of anything; recurses by shifting *b* left.
    """
    if a == []:
        return True
    if b == []:
        return False
    return b[:len(a)] == a or is_sublist(a, b[1:])


nlp = spacy.de.German()

nounPhrases = []
nounPhrMarkov = []
posTagMarkov = []
ndEntity = []

stopword = "/home/100biere/demo/stopwordlist.de.txt"
filename = "/home/100biere/demo/instagram_ohneumlauts.txt"
writename = "/home/100biere/demo/output.txt"
doc2vecpath = "/home/100biere/demo/Doc2Vec.bin"
standardSentence = u"Instagram Community trifft sich in Berlin und Frankfurt am Main - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."

# markovify.make_sentence tuning (original defaults were 25 / 0.25 / 7).
triesMarkov = 50
max_overlap_ratioMarkov = 0.75
max_overlap_totalMarkov = 14

# Get raw text as string; the 'with' block closes the handle automatically,
# so no explicit f.close() is needed.
with codecs.open(filename, 'r', encoding='utf-8') as f:
    text = f.read()

text = remove_non_ascii(text)
wordList = text.split(' ')

nounPhrases = NP.tokenize(standardSentence, nlp)

# Collect named entities from the reference sentence; only entities whose
# first token starts uppercase are accepted, and duplicates are skipped.
nameEntity = Text(standardSentence)
for entity in nameEntity.entities:
    if entity[0][0].isupper() and entity not in ndEntity:  # only uppercase allowed
        # print(entity.tag, entity)
        ndEntity.append(entity[0])

# Build the Markov model over the corpus (state size 2 = bigram states).
text_model = markovify.Text(text, state_size=2)

count = 0
while True:
    # Original call: make_sentence(tries=25, max_overlap_ratio=0.25, max_overlap_total=7)
    markov_sample = text_model.make_sentence(
        tries=triesMarkov,
        max_overlap_ratio=max_overlap_ratioMarkov,
        max_overlap_total=max_overlap_totalMarkov)
    # make_sentence returns None on failure; also require a minimum length.
    if markov_sample is not None and len(markov_sample) > 45:
        count = count + 1
        print("Count overall try:")
        print(count)
        createGrammarBasedSamples(text, markov_sample, nlp)
        createNounPhraseSamples(text, markov_sample, nounPhrases, nlp)
        createNERBasedSamples(text, markov_sample, ndEntity)
        with codecs.open(writename, 'w', encoding='utf-8') as f:
            f.write(markov_sample)
        # Report runtime and terminate after the first accepted sample —
        # without this exit the 'while True' loop would never end.
        ft = (time.time() - start_time)
        print("Script Runtime: --- ")
        print(ft)
        print(" ---- Seconds")
        sys.exit(0)