# -*- coding: utf-8 -*-
# Build a 5-gram language model over an Instagram text corpus and report its
# perplexity on a held-out test split (95% train / 5% test).
#
# Reference links kept from the original header:
#   https://spacy.io/docs/#tutorials
#   https://pypi.python.org/pypi/language-check   (grammar checking)
#   http://norvig.com/spell-correct.html          (spell correction)
#   http://www.decontextualize.com/teaching/rwet/n-grams-and-markov-chains/

import time

start_time = time.time()  # wall-clock start; kept for later timing use

import os

os.system('clear')

# Third-party NLP stack, unchanged from the original file.  Several of these
# are unused in the visible code but are kept because other parts of the
# project may rely on this module importing them.
import polyglot
from polyglot.text import Text, Word
import NP
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import gensim, logging
import markovify
from langdetect import detect
import spacy
from spacy.de import German
import base64
import os.path
import json
import pprint
import codecs
import nltk
import re
import string
from subprocess import call
import sys
from textblob_de import TextBlobDE
from unidecode import unidecode

filename = "/home/100biere/demo/instagram_ohneumlauts.txt"
with codecs.open(filename, 'r', encoding='utf-8') as f:
    text = f.read()
# NOTE: the redundant f.close() was removed -- the with-block closes the file.

# BUG FIX: the original string literal was broken across a physical line
# ("... <newline> build"), which is a SyntaxError.
print("... build")

corpus = [word.lower() for word in text.split(" ")]

# Train on 95% of the corpus and test on the rest.
# BUG FIX: use integer division -- on Python 3, 95*len(corpus)/100 is a float
# and corpus[:spl] would raise TypeError.
spl = 95 * len(corpus) // 100
train = corpus[:spl]
test = corpus[spl:]

# Remove rare words (frequency < 5) from the training vocabulary.
fdist = nltk.FreqDist(w for w in train)
# BUG FIX: FreqDist has no iteritems() on Python 3 -- use items().
vocabulary = {word for word, count in fdist.items() if count >= 5}
# BUG FIX: materialize as lists -- on Python 3, map() returns a one-shot
# iterator, so len(train)/len(test) below would raise TypeError and the
# data would be exhausted after a single pass.
train = [w if w in vocabulary else "*unknown*" for w in train]
test = [w if w in vocabulary else "*unknown*" for w in test]

print("... train")

from nltk.probability import LidstoneProbDist

# Lidstone smoothing with gamma=0.2.
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
# NOTE(review): nltk.model.ngram.NgramModel was removed in NLTK 3.x; this
# call only works on NLTK 2.x.  TODO: port to a maintained implementation
# (e.g. nltk.lm) -- confirm which NLTK version this project pins.
lm = nltk.model.ngram.NgramModel(5, train, estimator=estimator)

print("len(corpus) = %s, len(vocabulary) = %s, len(train) = %s, len(test) = %s" % (
    len(corpus), len(vocabulary), len(train), len(test)))
print("perplexity(test) =", lm.perplexity(test))