#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
# python -m spacy.en.download
# python -m spacy.de.download
# https://spacy.io/docs/#tutorials
# https://www.w3.org/services/html2txt
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer

import time

# pip install --upgrade 3to2
# pip install --upgrade language-check  ### grammar checker
# https://github.com/lpenz/atdtool/blob/master/atdtool/__init__.py
# http://stackoverflow.com/questions/10252448/how-to-check-whether-a-sentence-is-correct-simple-grammar-check-in-python
#   --> https://pypi.python.org/pypi/grammar-check
#       https://pypi.python.org/pypi/language-check
# Statistical spell- and (occasional) grammar-checker: http://lindat.mff.cuni.cz/services/korektor
#   https://github.com/ufal/korektor
# Spell correction: http://norvig.com/spell-correct.html
# Grammar correction: http://www.abisource.com/projects/link-grammar/#download
# Python-based grammar check based on learning: https://www.openhub.net/p/grac
# http://www.decontextualize.com/teaching/rwet/n-grams-and-markov-chains/

start_time = time.time()

import os
os.system('clear')

# http://polyglot.readthedocs.io/en/latest/Installation.html
import polyglot
from polyglot.text import Text, Word
import NP
#import language_check
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import gensim, logging
import markovify
from langdetect import detect
import spacy
from spacy.de import German
import base64
import os.path
import json
import pprint
import codecs
import nltk
import re
import string
from subprocess import call
import sys  # import sys package, if not already imported
from textblob_de import TextBlobDE
from unidecode import unidecode


def remove_non_ascii(text):
    return unidecode(text)
    #return unidecode(unicode(text, encoding="utf-8"))


# /home/100biere/demo/tensorflow/models/syntaxnet
# https://wiki.python.org/moin/LanguageParsing
# http://nlp.stanford.edu/software/lex-parser.shtml
#de_nlp = spacy.load('de', tagger=True, parser=True, entity=True)
#de_nlp = spacy.de.German()

nounPhrases = []
nounPhrMarkov = []
stopword = "/home/100biere/demo/stopwordlist.de.txt"
filename = "/home/100biere/demo/instagram_ohneumlauts.txt"
writename = "/home/100biere/demo/output.txt"
doc2vecpath = "/home/100biere/demo/Doc2Vec.bin"
#tool = grammar_check.LanguageTool('de_DE')
standard_sentence = "Instagram: Die Foto Community gilt als Jugendtreffpunkt"

# Read the raw text as a single string.
with codecs.open(filename, 'r', encoding='utf-8') as f:
    text = f.read()

text = remove_non_ascii(text)

# https://github.com/jsvine/markovify
###nounPhrases = NP.tokenize(text)
###print("Noun Phrases:")
###print(nounPhrases)

WordList = text.split(' ')

'''
print("NER Phrases:")
nameEntity = Text(text.encode())
#print(nameEntity)
#sys.exit(0)
ndEntity = []
for entity in nameEntity.entities:
    if entity[0][0].isupper() and entity not in ndEntity:  # only uppercase entities allowed
        print(entity.tag, entity)
        print()
        ndEntity.append(entity)
'''
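# --- Optional sketch (not called anywhere in this script): grammar-checking the loaded text. ---
# The header notes and the commented-out "#import language_check" / "#tool = grammar_check.LanguageTool('de_DE')"
# lines suggest LanguageTool was considered here. This is only a minimal sketch of the
# language-check package as documented on PyPI; the helper name, the 'de-DE' locale string
# and the idea of running it on the loaded text are assumptions, not part of the original pipeline.
def check_grammar_sketch(raw_text):
    import language_check                     # pip install --upgrade language-check (see header)
    tool = language_check.LanguageTool('de-DE')
    matches = tool.check(raw_text)            # rule violations found by LanguageTool
    corrected = language_check.correct(raw_text, matches)  # text with the suggested fixes applied
    return matches, corrected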
# Build the Markov chain model.
text_model = markovify.Text(text, state_size=2)

print("Markov Text Sample:")
while True:
    markov_sample = text_model.make_sentence(tries=10, max_overlap_ratio=0.25, max_overlap_total=7)
    # make_sentence() can return None; keep sampling until a sentence of reasonable length comes back.
    if markov_sample and len(markov_sample) > 30:
        print(markov_sample)
        print("\n")
        break
print("\n")

with codecs.open(writename, 'w', encoding='utf-8') as f:
    f.write(markov_sample)

'''
### http://norvig.com/spell-correct.html
print("LanguageTool Corrected Text: ")
# http://wiki.languagetool.org/command-line-options
os.system("java -jar /home/100biere/software/LanguageTool-3.5/languagetool-commandline.jar -adl -a /home/100biere/demo/output.txt")
print("\n")
'''

print("Calculate BLEU Score: ", end="")
# NOTE: sentence_bleu() expects a list of tokenized reference sentences and a tokenized
# hypothesis; passing raw strings makes NLTK compare character n-grams instead of words.
BLEUscore = nltk.translate.bleu_score.sentence_bleu([text], markov_sample, weights=(0.85, 0.15))
print(BLEUscore)
print('%012.1f' % BLEUscore)
print("{0:.15f}".format(BLEUscore))
print("\n")

ft = (time.time() - start_time)
print("Script Runtime: ---", ft, "---- Seconds")
sys.exit(0)

# Everything below is unreachable because of the sys.exit(0) above.
raw_sentences = []
#tokens = nltk.word_tokenize(text)
text_sent = nltk.sent_tokenize(text)
for a in text_sent:
    words = nltk.word_tokenize(a)
    #pprint.pprint(words)
    #print(type(a))
    raw_sentences.append(words)

sentences = [gensim.models.doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(raw_sentences)]

if os.path.isfile(doc2vecpath):
    model = gensim.models.Doc2Vec.load(doc2vecpath)

# NOTE: any model loaded above is overwritten here, so training always starts from scratch.
model = gensim.models.Doc2Vec(iter=50, size=400, workers=4, sorted_vocab=1,
                              alpha=0.075, min_alpha=0.045)  # min_count=1
model.sort_vocab()
model.build_vocab(sentences)
for epoch in range(40):
    model.train(sentences)

model.init_sims(replace=False)   # can read, write and keep training -> more memory
#model.init_sims(replace=True)   # read-only, no further training -> less memory
model.save(doc2vecpath)

# Vocabulary stored in the Doc2Vec model.
###wordVocab = [k for (k, v) in model.vocab.iteritems()]
#pprint.pprint(wordVocab)
#sys.exit(0)

#try:
#sim = model.most_similar(positive=[standard_sentence], negative=[], topn=5)
sim = model.most_similar("Instagram")
sim_word = sim[0]
sim_calc = sim_word[1]
#print("Similarity:")
#print(sim)
#except KeyError:
#    pass

ft = (time.time() - start_time)
print("Script Runtime: ---", ft, "---- Seconds")
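# --- Optional sketch (never called): scoring an unseen sentence against the Doc2Vec model. ---
# The commented-out "model.most_similar(positive=[standard_sentence], ...)" call above cannot
# work on a whole sentence, because most_similar() only looks up single vocabulary words.
# A minimal sketch using gensim's infer_vector() together with docvecs.most_similar() instead;
# the helper name and the topn default are assumptions, and it presumes a gensim version whose
# docvecs.most_similar() accepts raw vectors.
def similar_documents_sketch(d2v_model, sentence, topn=5):
    tokens = nltk.word_tokenize(sentence)                    # same tokenization as the training data
    vec = d2v_model.infer_vector(tokens)                     # embed the unseen sentence
    return d2v_model.docvecs.most_similar([vec], topn=topn)  # (document tag, similarity) pairs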