#!/usr/bin/python3.5 -S
# -*- coding: utf-8 -*-
import os
import sys
import time
import logging
import codecs
import string
from collections import defaultdict
from pprint import pprint  # pretty-printer

import nltk
import gensim
from gensim import corpora, models, similarities
# https://radimrehurek.com/gensim/apiref.html
from gensim.models import word2vec, doc2vec
# pip install pyemd  (required for wmdistance)
from sklearn.feature_extraction.text import TfidfVectorizer
# pip install --upgrade sklearn

word2vecpath = "/home/100biere/demo/realdemo/Word2Vec.mod"
doc2vecpath = "/home/100biere/demo/realdemo/Doc2Vec.mod"
doc2vecpathDict = "/home/100biere/demo/realdemo/Doc2Vec.dict"
doc2vecpathCorp = "/home/100biere/demo/realdemo/Doc2Vec.corp"
stopword = "/home/100biere/demo/realdemo/stopwordlist.de.txt"


def CalcCosineSimilarity(documents, markov_sentence):
    with codecs.open(stopword, 'r', encoding='utf-8') as f:
        stopwords_tmp = f.read()
    stopwords = set(stopwords_tmp.strip().split())

    stemmer = nltk.stem.porter.PorterStemmer()
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

    def stem_tokens(tokens):
        return [stemmer.stem(item) for item in tokens]

    def normalize(text):
        """Remove punctuation, lowercase, stem."""
        return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

    vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words=stopwords)

    def cosine_sim(text1, text2):
        tfidf = vectorizer.fit_transform([text1, text2])
        return ((tfidf * tfidf.T).A)[0, 1]

    print("CalcCosineSimilarity:", cosine_sim(documents, markov_sentence))
    return True


def CalcDocSimilarity(documents, markov_sentence):
    if os.path.isfile(doc2vecpath):
        # Reuse the previously saved dictionary, corpus and LSI model.
        dictionary = corpora.Dictionary.load(doc2vecpathDict)
        corpus = gensim.corpora.mmcorpus.MmCorpus(doc2vecpathCorp)
        lsi = models.LsiModel.load(doc2vecpath)
        # index = similarities.MatrixSimilarity(lsi[corpus], num_features=len(corpus))
        vec_bow1 = dictionary.doc2bow(markov_sentence.lower().split(), allow_update=False, return_missing=False)
        vec_lsi1 = lsi[vec_bow1]
        # sims1 = index[vec_lsi1]  # perform a similarity query against the corpus
        # print("sim1 value:")
        # pprint(index)
        # print(sims1)
        try:
            if vec_lsi1[0][1]:
                return vec_lsi1[0][1]
            else:
                return 0.0
        except Exception:
            return 0.0
    else:
        # documents is passed in as a plain text string
        docs = [documents]
        with codecs.open(stopword, 'r', encoding='utf-8') as f:
            stopwords_tmp = f.read()
        stopwords = set(stopwords_tmp.strip().split())

        texts = [[word for word in document.lower().split() if word not in stopwords]
                 for document in docs]
        # keep only tokens that occur more than once
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > 1] for text in texts]

        dictionary = corpora.Dictionary(texts)
        dictionary.save(doc2vecpathDict)
        corpus = [dictionary.doc2bow(text) for text in texts]
        gensim.corpora.mmcorpus.MmCorpus.serialize(doc2vecpathCorp, corpus)
        # lda = gensim.models.LdaModel(corpus, id2word=dictionary, alpha='auto', num_topics=30, chunksize=100000, passes=2)
        lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=30, chunksize=100000)
        # index = similarities.MatrixSimilarity(lsi[corpus], num_features=len(corpus))
        # transform the corpus to LSI space, save the model and score the sentence
        lsi.save(doc2vecpath)
        vec_bow1 = dictionary.doc2bow(markov_sentence.lower().split())
        vec_lsi1 = lsi[vec_bow1]
        # sims1 = index[vec_lsi1]  # perform a similarity query against the corpus
        try:
            if vec_lsi1[0][1]:
                return vec_lsi1[0][1]
            else:
                return 0.0
        except Exception:
            return 0.0


def CalcWordSimilarity(documents, markov_sentence, train=True):
    # https://radimrehurek.com/gensim/models/phrases.html#module-gensim.models.phrases
    # gensim Phrases can be used to model bi- and trigrams.
    #
    # n_similarity(ws1, ws2)
    # score(sentences, total_sentences=1000000, chunksize=100, queue_factor=2, report_delay=1)
    #   --> https://radimrehurek.com/gensim/models/word2vec.html
    if train:
        # retrain from scratch and overwrite any previously saved model
        model = gensim.models.Word2Vec(documents, sg=1, hs=1, iter=150, size=800, workers=1,
                                       sorted_vocab=1, alpha=0.325, min_count=1)
        model.init_sims(replace=False)   # can read, write and keep training -> more memory
        # model.init_sims(replace=True)  # read-only, no further training -> less memory
        model.save(word2vecpath)
        return True
    elif os.path.isfile(word2vecpath):
        model = gensim.models.Word2Vec.load(word2vecpath)
    else:
        model = gensim.models.Word2Vec(documents, sg=1, hs=1, iter=50, size=400, workers=1,
                                       sorted_vocab=1, alpha=0.325, min_count=1)
        model.init_sims(replace=False)   # can read, write and keep training -> more memory
        # model.init_sims(replace=True)  # read-only, no further training -> less memory
        model.save(word2vecpath)

    # stopword-filtered tokens of the generated sentence (currently unused)
    docs = [markov_sentence]
    with codecs.open(stopword, 'r', encoding='utf-8') as f:
        stopwords_tmp = f.read()
    stopwords = set(stopwords_tmp.strip().split())
    texts = [[word for word in document.lower().split() if word not in stopwords]
             for document in docs]

    '''
    print("Markov Score:")
    calc = model.score([markov_sentence.split()])
    print(calc)
    print("Standard Sentence Score:")
    standardSentence = u"Instagram Community trifft sich in Berlin und Frankfurt am Main - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."
    calc = model.score([standardSentence])
    print(calc)
    print("Direct Corpus Sentence Score:")
    sce = u"Instagram aehnelt in vielerlei Hinsicht Twitter, unter anderem weil es wie der Kurznachrichtendienst die Moeglichkeit bietet, in Echtzeit mit der Community zu kommunizieren."
    calc = model.score([sce])
    print(calc)
    '''

    # model.score() expects a list of tokenized sentences and only works with hs=1
    print("Markov Text Score 1:", model.score([markov_sentence.split()]))
    print("Documents Score 2:", model.score([documents.split()]))

    # n_similarity() expects two lists of words
    calc = model.n_similarity(documents.split(), markov_sentence.split())
    print("Markov Sentence Similarity Score:", calc)

    '''
    standardSentence = u"Instagram Community trifft sich in Berlin und Frankfurt am Main - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."
    calc = model.n_similarity(documents, standardSentence)
    print("Standard Sentence Similarity Score:")
    print(calc)
    # sce = u"Instagram aehnelt in vielerlei Hinsicht Twitter, unter anderem weil es wie der Kurznachrichtendienst die Moeglichkeit bietet, in Echtzeit mit der Community zu kommunizieren."
    sce = u"Biologisch betrachtet ist die DNA essentiell."
    calc = model.n_similarity(documents, sce)
    print("Direct Corpus Sentence Similarity Score:")
    print(calc)
    '''

    # wmdistance() also expects token lists; a distance of 0.0 means identical documents
    distance = model.wmdistance(documents.split(), markov_sentence.split())
    print("Distance wmd Score:", distance)
    return True
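
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original script): shows how the three
# similarity helpers would be called on a corpus text and a generated Markov
# sentence. The sample strings are taken from the commented-out experiments
# above and stand in for real inputs; the stopword file and the hard-coded
# model paths must exist for these calls to succeed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_corpus = u"Instagram aehnelt in vielerlei Hinsicht Twitter, unter anderem weil es wie der Kurznachrichtendienst die Moeglichkeit bietet, in Echtzeit mit der Community zu kommunizieren."
    sample_markov = u"Instagram Community trifft sich in Berlin und Frankfurt am Main - heute erklaeren wir euch, wie Ihr mehr Follower und mehr Fotos und Videos teilen koennt."

    # TF-IDF cosine similarity between corpus text and generated sentence
    CalcCosineSimilarity(sample_corpus, sample_markov)

    # LSI topic-space score; builds and saves dictionary/corpus/model on first run
    print("CalcDocSimilarity:", CalcDocSimilarity(sample_corpus, sample_markov))

    # Word2Vec-based scores; train=True retrains the model from scratch (slow),
    # train=False reuses the model saved at word2vecpath
    CalcWordSimilarity(sample_corpus, sample_markov, train=False)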