#!/usr/bin/python2.7 -S
# -*- coding: utf-8 -*-
# Setup: python -m spacy.en.download && python -m spacy.de.download
# https://spacy.io/docs/#tutorials
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer
import time
start_time = time.time()

from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import gensim, logging
import markovify
from langdetect import detect
#import spacy
import base64
import json
import pprint
import codecs
import nltk
import re
import sys

# Python 2 workaround so str/unicode conversions default to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

from textblob_de import TextBlobDE as TextBlob


def only_letters(string):
    """Return True if the string consists solely of ASCII letters."""
    CHECK_RE = re.compile('[a-zA-Z]+$')
    return CHECK_RE.match(string) is not None


#de_nlp = spacy.load('de', tagger=True, parser=True, entity=True)

filename = "/home/100biere/v3test/input.txt"

# Get raw text as string.
with codecs.open(filename, 'r', encoding='utf8') as f:
    text = f.read()
#pprint.pprint(sentences)
#sys.exit(0)

# Build the Markov chain model and draw one sample sentence.
text_model = markovify.Text(text)

# Print five randomly-generated sentences:
#for i in range(5):
#    print(text_model.make_sentence())
markov_sample = text_model.make_sentence()

'''
de_doc = de_nlp(markov_sample)
noun_phrases = de_doc.noun_chunks
#pprint.pprint(noun_phrases)
'''

blob = TextBlob(text)
blob_markov = TextBlob(markov_sample)

'''
print("Noun chunks from spaCy:")
for np in de_doc.noun_chunks:
    print(np.text)
'''

nounPhrases = []
nounPhrMarkov = []
word = u"Baby"

# Collect noun phrases that contain only letters and at least one uppercase letter.
#print("Noun phrases from TextBlob:")
for np in blob.noun_phrases:
    v = np.encode('utf-8', 'xmlcharrefreplace')
    ok_upper = 0
    ok_only = 0
    if only_letters(v):
        ok_only = 1
    for l in v:
        if l.isupper():
            ok_upper = 1
    if ok_upper == 1 and ok_only == 1:
        nounPhrases.append(v)
        print(v)
    # Debug output: candidate phrase and its filter flags.
    print(v)
    print(ok_only)
    print(ok_upper)
    print("\n")

'''
#pprint.pprint(nounPhrases)
###print("#########################")
#pprint.pprint(nounPhrMarkov)
for np in blob_markov.noun_phrases:
    #vv = np.encode('utf-8', 'xmlcharrefreplace')
    #nounPhrMarkov.append(vv)
    #matches = {x for x in nounPhrases if x in vv}
    #pprint.pprint("Noun Phrase: " + vv), pprint.pprint(len(matches))
'''

# Check whether any noun phrase from the source text also occurs in the Markov sample.
if any(x in blob_markov.noun_phrases for x in blob.noun_phrases):
    pprint.pprint("Noun Phrase hit: ")

#print("#########################")
print("Markov Sample Text: " + markov_sample)

# Tokenize the source text into sentences, then into word lists, for Doc2Vec training.
raw_sentences = []
#tokens = nltk.word_tokenize(text)
text_sent = nltk.sent_tokenize(text)
for a in text_sent:
    words = nltk.word_tokenize(a)
    raw_sentences.append(words)

sentences = [gensim.models.doc2vec.TaggedDocument(words, [i])
             for i, words in enumerate(raw_sentences)]

model = gensim.models.Doc2Vec(sentences, iter=50, size=400, workers=1, sorted_vocab=1,
                              alpha=0.325, min_alpha=0.225, min_count=1)
model.init_sims(replace=False)   # can read, write, and keep training -> more memory
#model.init_sims(replace=True)   # read-only, no further training -> less memory
#model.save(path)

# Look up the five words most similar to `word` and print the top similarity score.
#try:
sim = model.most_similar(positive=[word], negative=[], topn=5)
v = sim[0]
vvv = v[1]
print(vvv)
#except KeyError:
#    pass

# java -jar /home/100biere/software/LanguageTool-3.5/languagetool-commandline.jar -adl -a /home/100biere/input.txt
print("Script Runtime: --- %s seconds ---" % (time.time() - start_time))
sys.exit(0)