#!/usr/bin/python2.7 -S
# -*- coding: utf-8 -*-
# python -m spacy.en.download
# python -m spacy.de.download
# https://spacy.io/docs/#tutorials
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer
import time
start_time = time.time()
# pip install --upgrade thinc
# pip3 install suds-jurko
import spacy
#import base64
import json
#import site
import codecs
import locale
import shelve
import gensim, logging
import pprint
import os.path
#import sphinxapi as SP
from sphinxapi import *
# pip install --upgrade mysql-python
import MySQLdb
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# pip install --upgrade spacy
# pip3 install --upgrade spacy
# apt-get install python3-pip
# pip install --upgrade pip
# pip3 install --upgrade pip
# pip install -U textblob
# python -m textblob.download_corpora
# pip install --upgrade langdetect
# pip install -U textblob-de
# python -m textblob.download_corpora
#from textblob_de import TextBlobDE as TextBlob
from langdetect import detect
#from textblob_de import TextBlobDE
#from textblob import TextBlob
from libleipzig import *
# https://github.com/palday/libleipzig-python
# pip install --upgrade pattern
import sys  # time is already imported at the top of the script
reload(sys)  # Python 2 workaround so the mixed str/unicode handling below does not raise UnicodeDecodeError
sys.setdefaultencoding('utf-8')
#sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
synonym_cache = '/home/100biere/cache/synonym_cache.shelve'
def get_synonyms(search):
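    """Look up `search` in the Sphinx index "onetopp", fetch the matching rows from the
    MySQL table `synonym_unique` and return a de-duplicated list of synonym lists."""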
docids = []
resList = []
q = '@word '+search
#q = '@synonyms Haus'
mode = SPH_MATCH_EXTENDED
host = 'localhost'
port = 9312
index = 'onetopp'
filtercol = 'group_id'
filtervals = []
sortby = '@relevance desc'
groupby = ''
groupsort = '@group desc'
    limit = 5  # must be at least 1
    weight = {'words': 100, 'synonyms': 65}  # SetFieldWeights() expects one integer weight per field
cl = SphinxClient()
#cl.SetServer ( host, port )
cl.SetServer("/var/run/searchd.sock", 0)
    cl.SetConnectTimeout(7.3)
cl.SetMatchMode (mode)
    ###cl.SetFieldWeights( weight ) # does not work yet
#cl.SetFilter( filtercol, filtervals )
cl.SetGroupBy(groupby, SPH_GROUPBY_ATTR, groupsort)
cl.SetSortMode(SPH_SORT_EXTENDED, sortby)
cl.SetLimits(0, limit, max(limit,10))
    cl.SetRankingMode(SPH_RANK_BM25)
    # run the query
    res = None
    try:
        res = cl.Query(q, index)
    except Exception:
        print('query failed: %s' % cl.GetLastError())
        return []
    if not res:
        # sphinxapi usually signals errors by returning None instead of raising
        print('query failed: %s' % cl.GetLastError())
        return []
    if cl.GetLastWarning():
        print('WARNING: %s\n' % cl.GetLastWarning())
# print('Query \'%s\' retrieved %d of %d matches in %s sec' % (q, res['total'], res['total_found'], res['time']))
# print('Query stats:')
# if 'words' in res:
# for info in res['words']:
# print('\t\'%s\' found %d times in %d documents' % (info['word'], info['hits'], info['docs']))
if 'matches' in res:
n = 1
# print('\nMatches:')
for match in res['matches']:
# attrsdump = ''
# for attr in res['attrs']:
# attrname = attr[0]
# attrtype = attr[1]
# value = match['attrs'][attrname]
# if attrtype==SPH_ATTR_TIMESTAMP:
# value = time.strftime ( '%Y-%m-%d %H:%M:%S', time.localtime(value) )
# attrsdump = '%s, %s=%s' % ( attrsdump, attrname, value )
# print('%d. doc_id=%s, weight=%d%s' % (n, match['id'], match['weight'], attrsdump))
docids.append(match['id'])
n += 1
#pprint.pprint(docids)
SqlQuery = "SELECT word, synonyms FROM synonym_unique WHERE";
for i, val in enumerate(docids):
sql_str = str(val)
if i>0:
#SqlQuery += "\""+sql_str+"\","
SqlQuery += sql_str+","
elif i<=0:
SqlQuery += " `uid` IN (\""+sql_str+"\","
SqlQuery = SqlQuery[:-1]
SqlQuery += ")"
#pprint.pprint("Das ist mein SQL: "+SqlQuery)
# Open database connection
# https://www.tutorialspoint.com/python/python_database_access.htm
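    # A parameterized alternative to the string-built query above (a sketch, assuming the same
    # `synonym_unique`/`uid` schema; MySQLdb substitutes the %s placeholders safely):
    #   placeholders = ",".join(["%s"] * len(docids))
    #   cursor.execute("SELECT word, synonyms FROM synonym_unique WHERE uid IN (" + placeholders + ")", docids)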
if len(docids)>=1:
#pprint.pprint("Das ist mein SQL: "+SqlQuery)
try:
dbs = MySQLdb.connect(user="root",
passwd="###########99",
host="127.0.0.1",
unix_socket="/var/run/mysqld/mysqld.sock",
port=3306,
db="onetopp")
# Execute the SQL command
cursor = dbs.cursor()
cursor.execute(SqlQuery)
dbs.commit()
# Fetch all the rows in a list of lists.
results = cursor.fetchall()
#pprint.pprint(results)
#sys.exit(0)
# disconnect from server
cursor.close()
dbs.close()
for row in results:
word = row[0]
synonyms = row[1]
w = synonyms.split(";")
resList.append(w)
# Now print fetched result
# print "HITS: word=%s,synonyms=%s" % (word, synonyms )
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            print "Error: unable to fetch data (query was: %s)" % SqlQuery
#for row in cursor:
# print(row)
    # make the result list unique while preserving order (set() would lose the ranking order)
    unique = []
    for w in resList:
        if w not in unique:
            unique.append(w)
    return unique
def read_similarity(word, sentences):
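    """Return the similarity score of `word`'s nearest neighbour in a per-word word2vec model
    (training and caching one from `sentences` if none exists on disk), or 0.0 on failure."""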
wv = 0.00000
if word and len(word)>=2 and len(sentences)>=2:
path = "/home/100biere/pretrained/"+word+".w2v.bin"
#print("Working on: "+word)
        # possible later improvement: use "similar_by_word(word, topn=10, restrict_vocab=None)" from
        # https://radimrehurek.com/gensim/models/word2vec.html, which directly returns the best matching words
        # possibly also train one word2vec model over the whole synonym SQL database once and then only load that model on demand
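        # A minimal sketch of that idea (assumes the pre-1.0 gensim API used in this script;
        # the combined-model path is hypothetical):
        #   big_model = gensim.models.Word2Vec.load("/home/100biere/pretrained/synonym_db.w2v.bin")
        #   for candidate, score in big_model.similar_by_word(word, topn=10):
        #       print(candidate, score)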
if os.path.isfile(path):
print ("Trained Gensim Model for Word found: "+word)
model = gensim.models.Word2Vec.load(path)
sim = model.most_similar(positive=[word], negative=[], topn=1)#model.similar_by_word(word)
pprint.pprint(sim)
try:
sim = model.most_similar(positive=[word], negative=[], topn=1)
v = sim[0]
vv = v[1]
return vv
except KeyError:
1
else: # do the training
print ("Doing GENSIM Training -> NO Model for Word found: "+word)
model = gensim.models.Word2Vec(sentences, sg=1, hs=1, iter=50, size=400, workers=1, sorted_vocab=1, alpha=0.325, min_count=1)
            model.init_sims(replace=False)  # keeps the full vectors: the model can still be trained, but uses more memory
            #model.init_sims(replace=True)  # read-only: no further training possible, but uses less memory
model.save(path)
            try:
                sim = model.most_similar(positive=[word], negative=[], topn=1)
                vv = sim[0][1]  # similarity score of the closest neighbour
                return vv
            except KeyError:
                pass
return wv
def get_key(key):
try:
return int(key)
except ValueError:
return key
def exchange_word(o):
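    """Return the best synonym for token `o` according to the word2vec similarity score,
    or the original word if no usable synonym is found."""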
j = str(o)
    if len(j) <= 1:  # skip words shorter than 2 characters
return o
ww = unicode(j, "utf-8")
#r = uni_leipzig(ww)
r = get_synonyms(ww)
#pprint.pprint(ww)
#pprint.pprint(r)
#sys.exit(0)
if not r:
return j
#pprint.pprint(r)
words = r #r.split(",")
#print ("Exchange Word: exchange_word(o)")
n = {}
nn = ""
for mm in words:
m = mm[0]
# print(type(m))
# pprint.pprint(m)
# my_a = m #m[m.find('"')+len('"'):m.rfind('"')]
# my_b = m #unicode(my_a, "utf-8")
        if len(m) >= 2:  # only consider synonyms of at least 2 characters
# print(type(m))
# print(m)
rs = read_similarity(m, r)
# print("for m in words: Similarity:"),
# print(rs)
n[m] = rs
#print(type(my_b))
#print("String: "+my_b)
#sys.exit(0)
#print(type(words))
#print("no schrottinger")
#sys.exit(1)
# for i, val in enumerate(r):
# print(val)
# w = val[0]
# if len(w) >= 2: # Wenn Synonym Länge mehr als 2 Zeichen ist
# rs = read_similarity(w, r)
# n[w] = rs
#
# sys.exit(1)
###print ("Synonyme mit Similarity Score:")
###pprint.pprint(n)
    if len(n) >= 1:
        # pick the candidate with the highest similarity score
        best = max(n, key=lambda key: n[key])
        nn = best.decode("utf-8", "ignore")
        # if the original word and the chosen synonym are identical, drop it and pick the next best candidate
        if nn == ww:
            print("Original word and chosen synonym are identical -> picking the next candidate")
            del n[best]
            if len(n) > 0:
                best = max(n, key=lambda key: n[key])
                nn = best.decode("utf-8", "ignore")
rsr = read_similarity(nn, r)
### if rsr <= 0.011:
### return ww
print("\tOriginal: "+ww+" -> Synonym: "+nn)
print("\tSynonym Similarity Score:"),
print(rsr)
#print("\tAlle Synonyme mit Similarity Score:")
#pprint.pprint(n)
return nn
#oDoc = (u'''Auf der Autobahn fahren viele Autos mit hoher Geschwindigkeit.''')
#oDoc = (u"Der Geschäftsführer ist im Meeting und fragt nach der Telefonnummer von seiner schönen Frau. Sie ist gerade mit ihren kleinen Kindern im Kindergarten und trifft danach ihre Freundinnen im Cafe.")
with codecs.open("/home/100biere/input.txt",'r',encoding='utf8') as f:
oDoc = f.read()
lang = detect(oDoc)
print ("Language detected: "+lang)
print ("Text: "+oDoc)
if lang == "en":
de_nlp = spacy.load('en', tagger=True, parser=False, entity=False)
from textblob import TextBlob
from textblob import Word
    from pattern.en import singularize, pluralize
elif lang == "de":
de_nlp = spacy.load('de', tagger=True, parser=False, entity=False)
from textblob_de import TextBlobDE as TextBlob
from textblob_de import Word
from pattern.de import singularize, pluralize
else:
de_nlp = spacy.load('de', tagger=True, parser=False, entity=False)
from textblob_de import TextBlobDE as TextBlob
from textblob_de import Word
from pattern.de import singularize, pluralize
print ("Tagging & Tokenizing Text")
oDoc_nlp = de_nlp(oDoc, tag=True, parse=False)
nDoc = ""
Tokens = [o for o in oDoc_nlp]
noun_max_change = 3
adj_max_change = 2
verb_max_change = 2
adj_count = 0
noun_count = 0
verb_count = 0
'''
# http://www.clips.ua.ac.be/pages/pattern-de
word = "Katze"
print singularize(word)
print pluralize(word)
sys.exit(0)
pprint.pprint(Tokens)
sys.exit(0)
'''
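# Replace up to noun_max_change nouns, verb_max_change verbs and adj_max_change adjectives
# with their highest-scoring synonym; every other token is copied unchanged.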
for o in Tokens:
    new_word = unicode(str(o), "utf-8")
    if o.pos_ == "NOUN" and noun_count < noun_max_change:
        nDoc += " " + exchange_word(new_word)
        noun_count += 1
    elif o.pos_ == "VERB" and verb_count < verb_max_change:
        nDoc += " " + exchange_word(new_word)
        verb_count += 1
    elif o.pos_ == "ADJ" and adj_count < adj_max_change:
        nDoc += " " + exchange_word(new_word)
        adj_count += 1
    else:
        nDoc += " " + new_word
print("Originalsatz: " + oDoc)
print("Ausgabesatz: " + nDoc)
# process Unicode text
with codecs.open("/home/100biere/output.txt",'w',encoding='utf8') as f:
f.write(nDoc)
print("Script Runtime: --- %s seconds ---" % (time.time() - start_time))
sys.exit(0)
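# NOTE: everything below this point is unreachable because of the sys.exit(0) above;
# uni_leipzig() and calc_similarity() are kept for reference only (uni_leipzig() is the
# commented-out alternative to get_synonyms() inside exchange_word()).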
def uni_leipzig(o):
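    """Fetch synonyms for `o` from the Leipzig Wortschatz web service (libleipzig Thesaurus),
    cache the JSON result in a shelve file and trigger word2vec training via calc_similarity()."""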
mo = str(o).lower()
ow = str(o)
db = shelve.open(synonym_cache)
try:
#print ("Read Cache: ")
t = db[mo] # retrieve cache
data = json.loads(t)
db.close()
# cache hit
if data:
print ("uni_leipzig() Cache hit for Entry: "+mo)
#pprint.pprint(data)
calc_similarity(mo, data)
return data
    except Exception:
        pass  # cache miss -> fall through to the web call below
for x in range(0, 3):
try:
#r = Cooccurrences("warmen", 5, 10)
uo = unicode(ow, "utf-8")
print(x),
print(" -> WebCall for Wort:"),
print(uo)
# mysql -u root -p openthesaurus < openthesaurus_dump.sql
#SELECT word FROM `term` WHERE `synset_id` = "1" AND `language_id` = "2"
# https://github.com/arbox/wlapi/blob/master/test/fixtures/vcr_cassettes/synonyms.yml
r = Thesaurus(ow, 20)
pprint.pprint(r)
# if we have a full list of Thesaurus Elements then we are happy
if r:
b1 = json.dumps(r)
db = shelve.open(synonym_cache)
db[mo] = b1 # store cache
db.close()
calc_similarity(mo, b1)
print ("uni_leipzig() Cache Write for Entry: "+mo)
pprint.pprint(b1)
return b1
except Exception as e:
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(exc_type, fname, exc_tb.tb_lineno)
print("Error while doing WebCall to Uni Leipzig")
continue
return False
# http://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
# https://www.quora.com/Is-skip-gram-negative-sampling-better-than-CBOW-NS-for-word2vec-If-so-why
def calc_similarity(word, sentences):
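    """Train a word2vec model for `word` from `sentences` and store it under
    /home/100biere/pretrained/ so read_similarity() can load it later."""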
    if word and len(word) >= 2 and len(sentences) >= 2:
        path = "/home/100biere/pretrained/" + word + ".w2v.bin"
        # if the model file already exists we have a trained model for this word and can safely return
        if os.path.isfile(path):
            print ("Trained Word2Vec model already on disk for input word: " + word)
            return 1
        print ("Calculating Word2Vec for input word: " + word)
        model2 = gensim.models.Word2Vec(sentences, sg=1, hs=1, iter=400, size=400, workers=1, sorted_vocab=1, alpha=0.325, min_count=1)
        model2.init_sims(replace=False)  # keeps the full vectors: the model can still be trained, but uses more memory
        #model2.init_sims(replace=True)  # read-only: no further training possible, but uses less memory
        model2.save(path)
    return 1