#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
# python -m spacy.en.download
# python -m spacy.de.download
# https://spacy.io/docs/#tutorials
# https://www.w3.org/services/html2txt
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer

import time

# pip install --upgrade 3to2
# pip install --upgrade language-check  ### grammar checker
# https://github.com/lpenz/atdtool/blob/master/atdtool/__init__.py
# http://stackoverflow.com/questions/10252448/how-to-check-whether-a-sentence-is-correct-simple-grammar-check-in-python
#   --> https://pypi.python.org/pypi/grammar-check
#       https://pypi.python.org/pypi/language-check
# Statistical spell- and (occasional) grammar-checker: http://lindat.mff.cuni.cz/services/korektor
#   https://github.com/ufal/korektor
# Spell correction: http://norvig.com/spell-correct.html
# Grammar correction: http://www.abisource.com/projects/link-grammar/#download
# Python-based grammar check based on learning: https://www.openhub.net/p/grac
# http://www.decontextualize.com/teaching/rwet/n-grams-and-markov-chains/

start_time = time.time()

import os
os.system('clear')

# http://polyglot.readthedocs.io/en/latest/Installation.html
import polyglot
from polyglot.text import Text, Word
import NP
#import language_check
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import gensim, logging
import markovify
from langdetect import detect
import spacy
from spacy.de import German
import base64
import os.path
import json
import pprint
import codecs
import nltk
import re
import string
from subprocess import call
import sys  # import sys package, if not already imported
from textblob_de import TextBlobDE
from unidecode import unidecode


def remove_non_ascii(text):
    return unidecode(text)
    #return unidecode(unicode(text, encoding="utf-8"))


# /home/100biere/demo/tensorflow/models/syntaxnet
# https://wiki.python.org/moin/LanguageParsing
# http://nlp.stanford.edu/software/lex-parser.shtml
#de_nlp = spacy.load('de', tagger=True, parser=True, entity=True)
#de_nlp = spacy.de.German()

nounPhrases = []
nounPhrMarkov = []
stopword = "/home/100biere/demo/stopwordlist.de.txt"
filename = "/home/100biere/demo/instagram_ohneumlauts.txt"
writename = "/home/100biere/demo/output.txt"
doc2vecpath = "/home/100biere/demo/Doc2Vec.bin"
#tool = grammar_check.LanguageTool('de_DE')
standard_sentence = "Instagram: Die Foto Community gilt als Jugendtreffpunkt"

# Read the raw text as a single string.
with codecs.open(filename, 'r', encoding='utf-8') as f:
    text = f.read()

text = remove_non_ascii(text)

# https://github.com/jsvine/markovify
###nounPhrases = NP.tokenize(text)
###print("Noun Phrases:")
###print(nounPhrases)

WordList = text.split(' ')

'''
print("NER Phrases:")
nameEntity = Text(text.encode())
#print(nameEntity)
#sys.exit(0)
ndEntity = []
for entity in nameEntity.entities:
    if entity[0][0].isupper() and entity not in ndEntity:  # only uppercase entities allowed
        print(entity.tag, entity)
        print()
        ndEntity.append(entity)
'''
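# --- Optional sketch (not called anywhere in this script): grammar-checking the loaded text. ---
# The header notes and the commented-out "#import language_check" / "#tool = grammar_check.LanguageTool('de_DE')"
# lines suggest LanguageTool was considered here. This is only a minimal sketch of the
# language-check package as documented on PyPI; the helper name, the 'de-DE' locale string
# and the idea of running it on the loaded text are assumptions, not part of the original pipeline.
def check_grammar_sketch(raw_text):
    import language_check                     # pip install --upgrade language-check (see header)
    tool = language_check.LanguageTool('de-DE')
    matches = tool.check(raw_text)            # rule violations found by LanguageTool
    corrected = language_check.correct(raw_text, matches)  # text with the suggested fixes applied
    return matches, corrected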
# Build the Markov chain model.
text_model = markovify.Text(text, state_size=2)

print("Markov Text Sample:")
while True:
    markov_sample = text_model.make_sentence(tries=10, max_overlap_ratio=0.25, max_overlap_total=7)
    # make_sentence() can return None; keep sampling until a sentence of reasonable length comes back.
    if markov_sample and len(markov_sample) > 30:
        print(markov_sample)
        print("\n")
        break
print("\n")

with codecs.open(writename, 'w', encoding='utf-8') as f:
    f.write(markov_sample)

'''
### http://norvig.com/spell-correct.html
print("LanguageTool Corrected Text: ")
# http://wiki.languagetool.org/command-line-options
os.system("java -jar /home/100biere/software/LanguageTool-3.5/languagetool-commandline.jar -adl -a /home/100biere/demo/output.txt")
print("\n")
'''

print("Calculate BLEU Score: ", end="")
# NOTE: sentence_bleu() expects a list of tokenized reference sentences and a tokenized
# hypothesis; passing raw strings makes NLTK compare character n-grams instead of words.
BLEUscore = nltk.translate.bleu_score.sentence_bleu([text], markov_sample, weights=(0.85, 0.15))
print(BLEUscore)
print('%012.1f' % BLEUscore)
print("{0:.15f}".format(BLEUscore))
print("\n")

ft = (time.time() - start_time)
print("Script Runtime: ---", ft, "---- Seconds")
sys.exit(0)

# Everything below is unreachable because of the sys.exit(0) above.
raw_sentences = []
#tokens = nltk.word_tokenize(text)
text_sent = nltk.sent_tokenize(text)
for a in text_sent:
    words = nltk.word_tokenize(a)
    #pprint.pprint(words)
    #print(type(a))
    raw_sentences.append(words)

sentences = [gensim.models.doc2vec.TaggedDocument(words, [i]) for i, words in enumerate(raw_sentences)]

if os.path.isfile(doc2vecpath):
    model = gensim.models.Doc2Vec.load(doc2vecpath)

# NOTE: any model loaded above is overwritten here, so training always starts from scratch.
model = gensim.models.Doc2Vec(iter=50, size=400, workers=4, sorted_vocab=1,
                              alpha=0.075, min_alpha=0.045)  # min_count=1
model.sort_vocab()
model.build_vocab(sentences)
for epoch in range(40):
    model.train(sentences)

model.init_sims(replace=False)   # can read, write and keep training -> more memory
#model.init_sims(replace=True)   # read-only, no further training -> less memory
model.save(doc2vecpath)

# Vocabulary stored in the Doc2Vec model.
###wordVocab = [k for (k, v) in model.vocab.iteritems()]
#pprint.pprint(wordVocab)
#sys.exit(0)

#try:
#sim = model.most_similar(positive=[standard_sentence], negative=[], topn=5)
sim = model.most_similar("Instagram")
sim_word = sim[0]
sim_calc = sim_word[1]
#print("Similarity:")
#print(sim)
#except KeyError:
#    pass

ft = (time.time() - start_time)
print("Script Runtime: ---", ft, "---- Seconds")
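# --- Optional sketch (never called): scoring an unseen sentence against the Doc2Vec model. ---
# The commented-out "model.most_similar(positive=[standard_sentence], ...)" call above cannot
# work on a whole sentence, because most_similar() only looks up single vocabulary words.
# A minimal sketch using gensim's infer_vector() together with docvecs.most_similar() instead;
# the helper name and the topn default are assumptions, and it presumes a gensim version whose
# docvecs.most_similar() accepts raw vectors.
def similar_documents_sketch(d2v_model, sentence, topn=5):
    tokens = nltk.word_tokenize(sentence)                    # same tokenization as the training data
    vec = d2v_model.infer_vector(tokens)                     # embed the unseen sentence
    return d2v_model.docvecs.most_similar([vec], topn=topn)  # (document tag, similarity) pairs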