# -*- coding: utf-8 -*-
#!/usr/bin/python3.5 -S
# python -m spacy.en.download python -m spacy.de.download
# https://spacy.io/docs/#tutorials
#https://www.w3.org/services/html2txt
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer
import time

#pip install --upgrade 3to2
#pip install --upgrade language-check

### grammar checker
######## https://github.com/lpenz/atdtool/blob/master/atdtool/__init__.py
###########http://stackoverflow.com/questions/10252448/how-to-check-whether-a-sentence-is-correct-simple-grammar-check-in-python --> https://pypi.python.org/pypi/grammar-check https://pypi.python.org/pypi/language-check

#Statistical spell- and (occasional) grammar-checker. http://lindat.mff.cuni.cz/services/korektor
#https://github.com/ufal/korektor

	#/home/100biere/demo/tensorflow/models/syntaxnet#
# https://wiki.python.org/moin/LanguageParsing
# http://nlp.stanford.edu/software/lex-parser.shtml

#https://github.com/jsvine/markovify

# spell correction: http://norvig.com/spell-correct.html

# grammar correct: http://www.abisource.com/projects/link-grammar/#download

# python based grammar check based on learning https://www.openhub.net/p/grac

# http://www.decontextualize.com/teaching/rwet/n-grams-and-markov-chains/

start_time = time.time()
import os
os.system('clear')

# http://polyglot.readthedocs.io/en/latest/Installation.html
import NP
import polyglot
from polyglot.text import Text, Word
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import gensim, logging
import markovify
from langdetect import detect
import spacy
import os.path
import pprint
import codecs
import re
import string
import sys  # import sys package, if not already imported
from textblob_de import TextBlobDE
from unidecode import unidecode

def remove_non_ascii(text):
	return unidecode(text)
	#return unidecode(unicode(text, encoding = "utf-8"))
	
def list_find(what, where):
    """Find `what` list in the `where` list.

    Return index in `where` where `what` starts
    or -1 if no such index.

    >>> f = list_find
    >>> f([2, 1], [-1, 0, 1, 2])
    -1
    >>> f([-1, 1, 2], [-1, 0, 1, 2])
    -1
    >>> f([0, 1, 2], [-1, 0, 1, 2])
    1
    >>> f([1,2], [-1, 0, 1, 2])
    2
    >>> f([1,3], [-1, 0, 1, 2])
    -1
    >>> f([1, 2], [[1, 2], 3])
    -1
    >>> f([[1, 2]], [[1, 2], 3])
    0
    """
    if not what: # empty list is always found
        return 0
    try:
        index = 0
        while True:
            index = where.index(what[0], index)
            if where[index:index+len(what)] == what:
                return index # found
            index += 1 # try next position
    except ValueError:
        return -1 # not found

def contains(what, where):
    """Return [start, end+1] if found else empty list."""
    i = list_find(what, where)
    return [i, i + len(what)] if i >= 0 else [] #NOTE: bool([]) == False	
	

def createRuleBasedSamples(input, ccount):
	posTagMarkov 	= []
	toks1 			= nlp(input)

	for token1 in toks1:
		posTagMarkov.append(token1.pos_)

	boolVal = contains(allowedRule, posTagMarkov)
	if boolVal:
		print(ccount), 
		print(" -> Good Markov Result (Rule-Based-Grammar):")
		print(input)
		print("\n")
		
		with codecs.open("good_rulebased_markov.log",'a',encoding='utf-8') as f:
			f.write("count: -> ")
			f.write('% 6.2f' % ccount)
			f.write(" -> ")
			f.write("triesMarkov: ")
			f.write('% 6.2f' % triesMarkov)
			f.write("  ---> max_overlap_ratioMarkov: ")
			f.write('% 6.2f' % max_overlap_ratioMarkov)
			f.write("  ---> max_overlap_totalMarkov: ")
			f.write('% 6.2f' % max_overlap_totalMarkov)
			f.write("  ---> ####### -> ")
			f.write(input)
			f.write("\n")
			f.close()
		#time.sleep(3)
		
	else:
		print(ccount), 
		print(" -> No Markov Result (Rule-Based-Grammar)")
	return True

def createNounPhraseSamples(input, ccount):
	iNP 		= NP.tokenize(input, nlp)
	boolNPVal 	= contains(nounPhrases, iNP)
	
	if boolNPVal:
		print(ccount), 
		print(" -> Good Markov Result (Noun Phrase):")
		print(input)
		print("\n")
		
		with codecs.open("good_nounphrase_markov.log",'a',encoding='utf-8') as f:
			f.write("count: -> ")
			f.write('% 6.2f' % ccount)
			f.write(" -> ")
			f.write("triesMarkov: ")
			f.write('% 6.2f' % triesMarkov)
			f.write("  ---> max_overlap_ratioMarkov: ")
			f.write('% 6.2f' % max_overlap_ratioMarkov)
			f.write("  ---> max_overlap_totalMarkov: ")
			f.write('% 6.2f' % max_overlap_totalMarkov)
			f.write("  ---> ####### -> ")
			f.write(input)
			f.write("\n")
			f.close()
		#time.sleep(3)
		
	else:
		print(ccount), 
		print(" -> No Markov Result (Noun Phrase)")
	return True

nlp 					= spacy.de.German()

nounPhrases 			= []
nounPhrMarkov 			= []
posTagMarkov 			= []

allowedRule				= [u'DET',u'ADJ',u'NOUN',u'VERB',u'ADV',u'ADJ']
####allowedRule			= [u'DET',u'ADJ',u'NOUN']

stopword				= "/home/100biere/demo/stopwordlist.de.txt"
filename 				= "/home/100biere/demo/instagram_ohneumlauts.txt"
writename 				= "/home/100biere/demo/output.txt"
doc2vecpath				= "/home/100biere/demo/Doc2Vec.bin"
standardSentence		= u"Instagram - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."

triesMarkov				= 10
max_overlap_ratioMarkov = 0.65
max_overlap_totalMarkov	= 8

# Get raw text as string.
with codecs.open(filename,'r', encoding='utf-8') as f:
    text 	= f.read()
f.close()

text 		= remove_non_ascii(text)
wordList 	= text.split(' ')
nounPhrases = NP.tokenize(standardSentence, nlp)
'''
doc 			= nlp(standardSentence)
for np in doc.noun_chunks:
    print(np.root.tag_, np.text, np.root.ent_type_)
	
sys.exit(0)

###nounPhrases 	= NP.tokenize(text)
###print("Noun Phrases:"),
###print(nounPhrases)

print("NER Phrases:"),
nameEntity = Text(text.encode())
#print(nameEntity)
#sys.exit(0)

ndEntity = []
for entity in nameEntity.entities:
	if entity[0][0].isupper() and entity not in ndEntity:	# only uppercase allowed
		print(entity.tag, entity)
		print()
		ndEntity.append(entity)
		

'''
# Build the model.
text_model = markovify.Text(text, state_size=2)

count = 0
while 1:
	count = count + 1
	### original: markov_sample 	= text_model.make_sentence(tries=25, max_overlap_ratio=0.25, max_overlap_total=7)
	markov_sample 	= text_model.make_sentence(tries=triesMarkov, max_overlap_ratio=max_overlap_ratioMarkov, max_overlap_total=max_overlap_totalMarkov)
	if markov_sample is not None and markov_sample and len(markov_sample) > 30:
		#createRuleBasedSamples(markov_sample, count)
		createNounPhraseSamples(markov_sample, count)

with codecs.open(writename,'w',encoding='utf-8') as f:
	f.write(markov_sample)
	f.close()

ft = (time.time() - start_time)
print("Script Runtime: --- ")
print(ft)
print(" ---- Seconds")
sys.exit(0)