#!/usr/bin/python3.6 -S -W ignore
# -*- coding: utf-8 -*-

import os
import sys
import codecs
import gzip
import json
import re
import subprocess
import unicodedata  # needed by strip_accents() below

import fasttext  # pip3 install -U fasttext
from gensim.models import FastText
from gensim.test.utils import common_texts  # used by the unreachable gensim example further down
from keras.preprocessing.text import Tokenizer
import numpy as np
import matplotlib.pyplot as plt

import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import WordPunctTokenizer
#import wikipedia

# Third-party helpers used by parse_html() below
import justext
import bleach
from bs4 import BeautifulSoup

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

import spacy  # See "Installing spaCy"
nlp_de = spacy.load('de_core_news_sm')
nlp_en = spacy.load('en_core_web_sm')
nlp_de.max_length = 10000000
nlp_en.max_length = 10000000

stemmer = WordNetLemmatizer()  # despite the name, this lemmatizes rather than stems


def preprocess_text(document):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(document))

    # Remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Substitute multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Remove a prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Convert to lowercase
    document = document.lower()

    # Lemmatization, stop-word removal and length filtering
    tokens = document.split()
    tokens = [stemmer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in en_stop]
    tokens = [word for word in tokens if len(word) > 3]

    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
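# Hedged example (added for illustration; the sample string is not taken from the
# training data): preprocess_text() lowercases, lemmatizes, and drops English stop
# words plus tokens shorter than four characters.
print("preprocess_text demo:", preprocess_text("The networks were training on several large GPUs!"))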
def _is_wordlike(tok):
    return tok.orth_ and tok.orth_[0].isalpha()


def sentence_division_suppresor(doc):
    """Spacy pipeline component that prohibits sentence segmentation between two
    tokens that start with a letter. Useful for taming overzealous sentence
    segmentation in the German model, and possibly others as well."""
    for i, tok in enumerate(doc[:-1]):
        if _is_wordlike(tok) and _is_wordlike(doc[i + 1]):
            doc[i + 1].is_sent_start = False
    return doc


# spaCy v2 API: the component callable is passed directly (spaCy v3 would require
# registering it via @Language.component and adding it by name).
nlp_de.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser')
nlp_en.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser')

# Filter via shell:
#cat unaique.all.txt | sed -e "s/([.\!?,'/()])/ 1 /g" | tr "[:upper:]" "[:lower:]" >> unaique.preprocessed.txt

# https://github.com/facebookresearch/fastText/blob/master/docs/supervised-tutorial.md
# http://soner.in/fasttext-grid-search/
# https://github.com/facebookresearch/fastText/blob/master/python/fasttext_module/fasttext/FastText.py

#train_data_tmp="/home/unaiqueFRAMEWORK/new_prototyp/fasttext/trainingsdata/train/unaique.preprocessed.txt"
train_data = "/home/unaiqueFRAMEWORK/new_prototyp/fasttext/trainingsdata/train/unaique.preprocessed.txt"

with codecs.open(train_data, 'r', encoding='utf-8') as fr:
    data1 = fr.read().replace('\n', '').strip()
    #data1 = fr.readlines()

artificial_intelligence = sent_tokenize(data1)
final_corpus = [preprocess_text(sentence) for sentence in artificial_intelligence if sentence.strip() != '']

word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in final_corpus]

embedding_size = 60
window_size = 40
min_word = 5
down_sampling = 1e-2

# gensim 3.x parameter names (size/iter); gensim 4 renames them to vector_size/epochs.
ft_model = FastText(word_tokenized_corpus,
                    size=embedding_size,
                    window=window_size,
                    min_count=min_word,
                    sample=down_sampling,
                    sg=1,
                    iter=100)

semantically_similar_words = {word: [item[0] for item in ft_model.wv.most_similar([word], topn=5)]
                              for word in ['artificial', 'intelligence', 'machine', 'network', 'recurrent', 'deep']}

for k, v in semantically_similar_words.items():
    print(k + ":" + str(v))

print(ft_model.wv.similarity(w1='artificial', w2='intelligence'))
exit(1)

# Unreachable example: gensim FastText on gensim's bundled common_texts corpus.
model = FastText(size=4, window=3, min_count=1)  # instantiate
model.build_vocab(sentences=common_texts)
model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)  # train

"""
r=codecs.open(train_data_tmp, 'r', encoding='utf-8')
data3 = r.read().replace('\n', '').strip().lower()
r.close()
data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data3)
w=codecs.open(train_data, 'w', encoding='ascii')
w.write(data3)
w.close()
"""

#model = fasttext.train_supervised(train_data, model='skipgram', lr=0.05, dim=250, ws=5, epoch=50)
#model = fasttext.train_supervised(train_data, epoch=100, lr=0.25, wordNgrams=3, verbose=2, dim=300, ws=25, minCount=1, loss="softmax")
#model = fasttext.train_supervised(train_data, epoch=50, lr=0.1, wordNgrams=3, verbose=2, minCount=1, dim=200, ws=10, loss='hs', thread=10)
# The official fastText binding calls the label-prefix parameter 'label' ('__label__' is also its default).
model = fasttext.train_supervised(train_data, epoch=100, lr=0.05, wordNgrams=3, thread=10, label='__label__')
model.save_model("/home/unaiqueFRAMEWORK/new_prototyp/fasttext/unaique_full.bin")
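# Hedged example (added; not part of the original pipeline): sanity-check the freshly
# trained classifier on an arbitrary sentence. The labels and probabilities returned
# depend entirely on the __label__ annotations present in train_data.
demo_labels, demo_probs = model.predict("how do i explain this to my manager", k=3)
print("fastText demo prediction:", demo_labels, demo_probs)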
#train_data2="/home/unaiqueFRAMEWORK/new_prototyp/fasttext/trainingsdata/teenager_fasttext70.txt"
#model = fasttext.train_unsupervised(train_data2, model='skipgram', lr=0.05, dim=200, ws=5, epoch=50)
#model.save_model("/home/unaiqueFRAMEWORK/new_prototyp/fasttext/binary/teenager_fasttext.bin")

exit(1)

#fasttext.load_model(self.fasttext_model_file)
#model.predict("Which baking dish is best to bake a banana bread ?", k=-1, threshold=0.5)

model.quantize(input=train_data, qnorm=True, retrain=False, cutoff=50000)
model.save_model("/home/unaiqueFRAMEWORK/new_prototyp/fasttext/unaique_small.ftz")
exit(1)


def __normalize_text(s):
    # SUBEXES (a list of sed substitution expressions) is expected to be defined elsewhere.
    for subex in SUBEXES:
        s = subprocess.check_output(['sed', subex], input=s.encode("latin-1")).decode("utf-8")
        # s = subprocess.check_output(['sed', subex], input=s.encode("iso-8859-1")).decode("utf-8")
    return s


def __spaces(s):
    return ' '.join(s.split())


def __digits(s):
    return ''.join(filter(lambda c: not c.isdigit(), s))


def preproc(s):
    return __digits(__spaces(s.lower()))


def beautifyCorpus(text):
    rList = list()
    nlp_de.max_length = len(text) + 1
    doc = nlp_de(text)
    for sent in doc.sents:
        s = str(sent)
        flag = isDublicateString(s)
        flag2 = isGoodCasing(s)
        if not flag and flag2:
            rList.append(s)
    return " ".join(rList)


def isDublicateString(text):
    t_Text = text.lower()
    #debug = True
    patternElement = ""
    n = 16  # number of leading characters that must repeat for the text to count as a duplicate
    if len(t_Text) >= n + 3:
        patternElement = t_Text[:n].strip()
        #print("PatternElement:", patternElement)
        r_Count = t_Text.count(patternElement)
        if r_Count > 1:
            #if debug:
            #    print("sentify.isNotDublicateString(): -> multiplePattern Error:", r_Count, ",", patternElement)
            return True
    # Example: "Breath of the WildThe Legend of Zelda: Breath of the Wild auf der E3 angespielt."
    # -> ['breath of the wi', 'ldthe legend of ', 'zelda: breath of', ' the wild auf de', 'r e3 angespielt.']
    t_List = [t_Text[i:i + n] for i in range(0, len(t_Text), n)]
    for t in t_List:
        t = t.strip()
        r_Count = t_Text.count(t)
        if r_Count > 1:
            #if debug:
            #    print("sentify.isNotDublicateString(): -> multiplePattern Error:", r_Count, ",", t)
            return True
    return False


def isGoodCasing(text):
    if not isinstance(text, str):
        return False
    strString = text  # str(text, 'utf-8')
    i = 0
    try:
        for c in strString:
            if c in [u'ß', u'ö', u'ä', u'ü', u'Ö', u'Ä', u'Ü']:
                pass
            elif c != " " and i + 1 < len(strString):
                if c.islower() and strString[i + 1].isupper():
                    # too many false positives
                    #print("sentify.isGoodCasing(): -> aA Error")  # aA
                    return False
                if c.isdigit() and (strString[i + 1].isupper() and strString[i + 1].isalpha()):
                    #if debug:
                    #    print("sentify.isGoodCasing(): -> 2P Error")
                    return False
                elif c.isdigit() and strString[i + 1].isalpha():
                    # too many false positives
                    #print("sentify.isGoodCasing(): -> 2a Error")  # 2a -> caution: may misfire on addresses in the output
                    pass  # return False
                elif c.isdigit() and strString[i + 1].isdigit():
                    pass  # 11
                elif c.isalpha() and strString[i + 1].isdigit():
                    #if debug:
                    #    print("sentify.isGoodCasing(): -> a2 Error")  # a2
                    return False
                elif c.isdigit() and strString[i + 1] not in ['.', '!', '?', '"', '-', '\'', ':', ' ']:
                    #if debug:
                    #    print("sentify.isGoodCasing -> text.lower()/upper() Error:")
                    return False
            i = i + 1
    except Exception:
        pass
    return True


def strip_accents(text):
    return "".join(char for char in unicodedata.normalize('NFKD', text)
                   if unicodedata.category(char) != 'Mn')
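# Hedged examples (added for illustration, not part of the original script):
# strip_accents() removes combining marks, and isDublicateString() flags the example
# sentence quoted in its comment because its 16-character prefix occurs twice.
print("strip_accents demo:", strip_accents("Café Zürich"))
print("isDublicateString demo:",
      isDublicateString("Breath of the WildThe Legend of Zelda: Breath of the Wild auf der E3 angespielt."))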
def parse_html(page):
    """Clean HTML tags for webpages that aren't Gutenberg books."""
    try:
        # https://github.com/miso-belica/jusText/tree/dev/justext/stoplists
        parts = justext.justext(page, justext.get_stoplist('English'))
        #parts = justext.justext(page, justext.get_stoplist('German'))
        #print(parts)
        paragraphs = list()
        for part in parts:
            #pp.pprint(part.is_boilerplate)
            if not part.is_boilerplate:
                #pp.pprint(part)
                paragraphs.append(part.text)
        s = str('\n\n'.join(paragraphs))
        if len(s) > 50:
            return s
        else:
            # Fallback: strip markup with BeautifulSoup if jusText kept too little text.
            soup = BeautifulSoup(page, "html.parser")  # "html.parser" or "lxml"
            #comments = soup.findAll(text=lambda text: isinstance(text, Comment))  # remove comments
            #[comment.extract() for comment in comments]
            for script in soup.findAll(["script", "style", 'footer', 'head']):
                #script.extract()  # rip it out
                script.decompose()  # rip it out
            #[x.extract() for x in soup.findAll(['script', 'style'])]
            #[x.decompose() for x in soup.findAll(['script', 'style'])]
            soup.prettify()
            myText = soup.get_text()
            plaintextv1 = bleach.clean(myText, strip=True, strip_comments=True)
            plaintext = re.sub(r'<.*?>', '', plaintextv1)
            #plaintext = plaintext.replace('\n', ' ')
            #plaintext = textify.removeDomainsSimple(plaintext)
            return plaintext  #.strip()
    except Exception:
        #print("Unexpected error:", sys.exc_info()[0])
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        #print(exc_type, fname, exc_tb.tb_lineno)
        return str("")


def encodeToASCII(text):
    encResults = text.encode('utf-8', "ignore")
    s_string = str(encResults.decode('ascii', "ignore"))
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToUTF8Adv(text):
    encResults = text.encode('utf-8', "ignore")
    #return str(encResults.decode('latin-1', "ignore"))
    s_string = str(encResults.decode('utf-8', "ignore"))  # "remove" is not a valid error handler; use "ignore"
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToLatin1(text):
    #text = text.replace('ß', 'ss')
    encResults = text.encode('utf-8', "ignore")
    s_string = str(encResults.decode('latin-1', "ignore"))
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string
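# Hedged example (added for illustration): encodeToASCII() silently drops non-ASCII
# characters, while encodeToLatin1() re-reads the UTF-8 bytes as Latin-1, so umlauts
# survive as two mojibake characters instead of disappearing.
print("encodeToASCII demo:", encodeToASCII("Köln grüßt dich"))
print("encodeToLatin1 demo:", encodeToLatin1("Köln grüßt dich"))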
"""
MAIN
"""

"""
for key in manager:
    #print (key)           # key
    #print (manager[key])  # value
    mystorepath = storepathData+"/"+key
    if os.path.exists(mystorepath):
        folders = list()
        files = list()
        for entry in os.scandir(mystorepath):
            if entry.is_dir():
                folders.append(entry.path)
            elif entry.is_file():
                files.append(entry.path)
        for f in files:
            #print(f)
            with codecs.open(f, 'r', encoding='utf-8') as fr:
                data3 = fr.read().replace('\n', '').strip()
                fr.close()
            #tra = transformText(h)
            #data2 = beautifyCorpus(data1)
            #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
            data3 = encodeToASCII(data3)
            #data3 = preproc(data3)
            if len(data3) > 50:
                with codecs.open(managerDB, 'a+', encoding='ascii') as fa:
                    #fa.write("manager###"+tra+"\n")
                    fa.write("__label__MANAGER"+" "+data3+"\n")
                    fa.close()

for key in teenager:
    #print (key)           # key
    #print (manager[key])  # value
    mystorepath = storepathData+"/"+key
    if os.path.exists(mystorepath):
        folders = list()
        files = list()
        for entry in os.scandir(mystorepath):
            if entry.is_dir():
                folders.append(entry.path)
            elif entry.is_file():
                files.append(entry.path)
        for f in files:
            #print(f)
            with codecs.open(f, 'r', encoding='utf-8') as fr:
                data3 = fr.read().replace('\n', '').strip()
                fr.close()
            #tra = transformText(h)
            #data2 = beautifyCorpus(data1)
            #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
            data3 = encodeToASCII(data3)
            #data3 = preproc(data3)
            if len(data3) > 50:
                with codecs.open(teenagerDB, 'a+', encoding='ascii') as fa:
                    #fa.write("teenager###"+tra+"\n")
                    fa.write("__label__TEENAGER"+" "+data3+"\n")
                    fa.close()

for key in parents:
    #print (key)           # key
    #print (manager[key])  # value
    mystorepath = storepathData+"/"+key
    if os.path.exists(mystorepath):
        folders = list()
        files = list()
        for entry in os.scandir(mystorepath):
            if entry.is_dir():
                folders.append(entry.path)
            elif entry.is_file():
                files.append(entry.path)
        for f in files:
            #print(f)
            with codecs.open(f, 'r', encoding='utf-8') as fr:
                data3 = fr.read().replace('\n', '').strip()
                fr.close()
            #tra = transformText(h)
            #data2 = beautifyCorpus(data1)
            #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
            data3 = encodeToASCII(data3)
            #data3 = preproc(data3)
            if len(data3) > 50:
                with codecs.open(parentsDB, 'a+', encoding='ascii') as fa:
                    #fa.write("parents###"+tra+"\n")
                    fa.write("__label__PARENTS"+" "+data3+"\n")
                    fa.close()

folders = list()
files = list()
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/manager/"):
    if entry.is_dir():
        folders.append(entry.path)
    elif entry.is_file():
        files.append(entry.path)
for f in files:
    if f.find("en") != -1:
        with codecs.open(f, 'r', encoding='utf-8') as fr:
            data3 = fr.read().replace('\n', '').strip()
            fr.close()
        data3 = parse_html(data3)
        ###tra = transformText(h)
        #data2 = beautifyCorpus(h)
        #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
        data3 = encodeToASCII(data3)
        #data3 = preproc(data3)
        if len(data3) > 50:
            with codecs.open(managerDB, 'a+', encoding='ascii') as fa:
                #fa.write("manager###"+tra+"\n")
                fa.write("__label__MANAGER"+" "+data3+"\n")
                fa.close()

folders = list()
files = list()
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/parents/"):
    if entry.is_dir():
        folders.append(entry.path)
    elif entry.is_file():
        files.append(entry.path)
for f in files:
    if f.find("en") != -1:
        with codecs.open(f, 'r', encoding='utf-8') as fr:
            data3 = fr.read().replace('\n', '').strip()
            fr.close()
        data3 = parse_html(data3)
        ###tra = transformText(h)
        #data2 = beautifyCorpus(h)
        #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
        data3 = encodeToASCII(data3)
        #data3 = preproc(data3)
        if len(data3) > 50:
            with codecs.open(parentsDB, 'a+', encoding='ascii') as fa:
                #fa.write("parents###"+tra+"\n")
                fa.write("__label__PARENTS"+" "+data3+"\n")
                fa.close()

folders = list()
files = list()
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/teenager/"):
    if entry.is_dir():
        folders.append(entry.path)
    elif entry.is_file():
        files.append(entry.path)
for f in files:
    if f.find("en") != -1:
        with codecs.open(f, 'r', encoding='utf-8') as fr:
            data3 = fr.read().replace('\n', '').strip()
            fr.close()
        data3 = parse_html(data3)
        ###tra = transformText(h)
        #data2 = beautifyCorpus(h)
        #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
        data3 = encodeToASCII(data3)
        #data3 = preproc(data3)
        if len(data3) > 50:
            with codecs.open(teenagerDB, 'a+', encoding='ascii') as fa:
                #fa.write("teenager###"+tra+"\n")
                fa.write("__label__TEENAGER"+" "+data3+"\n")
                fa.close()
"""

"""
exit(1)
"""
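# Hedged sketch (added; not part of the original script): how the quantized classifier
# saved above could be loaded for inference later. Guarded so it only runs if the
# .ftz file actually exists on disk.
_ftz_path = "/home/unaiqueFRAMEWORK/new_prototyp/fasttext/unaique_small.ftz"
if os.path.exists(_ftz_path):
    _clf = fasttext.load_model(_ftz_path)
    print(_clf.predict("example sentence to classify", k=1))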