#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
"""Generate Markov-chain sentence samples from an Instagram text corpus.

Builds a second-order markovify model over a German corpus, extracts noun
phrases and named entities from a reference sentence, then loops until one
acceptable Markov sentence (> 45 chars) is produced; that sample is fed to the
rule/NP/NER-based sample generators, written to disk, and the script exits.

Setup notes:
    python -m spacy.en.download
    python -m spacy.de.download

Reference links kept from the original header:
    https://spacy.io/docs/#tutorials
    https://explosion.ai/blog/displacy-ent-named-entity-visualizer
    Grammar checking:  https://pypi.python.org/pypi/language-check
                       http://lindat.mff.cuni.cz/services/korektor
    Markov chains:     https://github.com/jsvine/markovify
    Spell correction:  http://norvig.com/spell-correct.html
    Link grammar:      http://www.abisource.com/projects/link-grammar/#download
"""
import time

start_time = time.time()

import os
os.system('clear')

# Project-local modules (NP tokenizer and the rule-based sample generators).
import NP
import RuleBasedSamples
# import GensimCalcSimilarity
from RuleBasedSamples import *
import polyglot
from polyglot.text import Text, Word
import markovify
from langdetect import detect
import spacy
import os.path
import pprint
import codecs
import re
import numpy
import string
import sys
from textblob_de import TextBlobDE
from unidecode import unidecode


def remove_non_ascii(text):
    """Transliterate any non-ASCII characters to their closest ASCII match."""
    return unidecode(text)
    # return unidecode(unicode(text, encoding = "utf-8"))


def is_sublist(a, b):
    """Return True if list *a* occurs as a contiguous sublist of *b*.

    An empty *a* is a sublist of anything; recurses by shifting *b* left.
    """
    if a == []:
        return True
    if b == []:
        return False
    return b[:len(a)] == a or is_sublist(a, b[1:])


nlp = spacy.de.German()

nounPhrases = []
nounPhrMarkov = []
posTagMarkov = []
ndEntity = []

stopword = "/home/100biere/demo/stopwordlist.de.txt"
filename = "/home/100biere/demo/instagram_ohneumlauts.txt"
writename = "/home/100biere/demo/output.txt"
doc2vecpath = "/home/100biere/demo/Doc2Vec.bin"
standardSentence = u"Instagram Community trifft sich in Berlin und Frankfurt am Main - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."

# markovify.make_sentence tuning (original defaults were 25 / 0.25 / 7).
triesMarkov = 50
max_overlap_ratioMarkov = 0.75
max_overlap_totalMarkov = 14

# Get raw text as string; the 'with' block closes the handle automatically,
# so no explicit f.close() is needed.
with codecs.open(filename, 'r', encoding='utf-8') as f:
    text = f.read()

text = remove_non_ascii(text)
wordList = text.split(' ')

nounPhrases = NP.tokenize(standardSentence, nlp)

# Collect named entities from the reference sentence; only entities whose
# first token starts uppercase are accepted, and duplicates are skipped.
nameEntity = Text(standardSentence)
for entity in nameEntity.entities:
    if entity[0][0].isupper() and entity not in ndEntity:  # only uppercase allowed
        # print(entity.tag, entity)
        ndEntity.append(entity[0])

# Build the Markov model over the corpus (state size 2 = bigram states).
text_model = markovify.Text(text, state_size=2)

count = 0
while True:
    # Original call: make_sentence(tries=25, max_overlap_ratio=0.25, max_overlap_total=7)
    markov_sample = text_model.make_sentence(
        tries=triesMarkov,
        max_overlap_ratio=max_overlap_ratioMarkov,
        max_overlap_total=max_overlap_totalMarkov)
    # make_sentence returns None on failure; also require a minimum length.
    if markov_sample is not None and len(markov_sample) > 45:
        count = count + 1
        print("Count overall try:")
        print(count)
        createGrammarBasedSamples(text, markov_sample, nlp)
        createNounPhraseSamples(text, markov_sample, nounPhrases, nlp)
        createNERBasedSamples(text, markov_sample, ndEntity)
        with codecs.open(writename, 'w', encoding='utf-8') as f:
            f.write(markov_sample)
        # Report runtime and terminate after the first accepted sample —
        # without this exit the 'while True' loop would never end.
        ft = (time.time() - start_time)
        print("Script Runtime: --- ")
        print(ft)
        print(" ---- Seconds")
        sys.exit(0)