# -*- coding: utf-8 -*-
#!/usr/bin/python3.6 -S -W ignore

import os
import codecs
import os,sys
import gzip
import json
import re
import subprocess
from fasttext import train_supervised
import fasttext
import pprint

pp = pprint.PrettyPrinter(indent=4)

import spacy  # See "Installing spaCy"
nlp_de				= spacy.load('de_core_news_sm')
nlp_en 				= spacy.load('en_core_web_sm')
nlp_de.max_length 	= 10000000
nlp_en.max_length 	= 10000000

def _is_wordlike(tok):
	return tok.orth_ and tok.orth_[0].isalpha()

def sentence_division_suppresor(doc):
	"""Spacy pipeline component that prohibits sentence segmentation between two tokens that start with a letter.
	Useful for taming overzealous sentence segmentation in German model, possibly others as well."""
	for i, tok in enumerate(doc[:-1]):
		if _is_wordlike(tok) and _is_wordlike(doc[i + 1]):
			doc[i + 1].is_sent_start = False
	return doc

nlp_de.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser')
nlp_en.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser')

# cat cooking.stackexchange.txt | sed -e "s/([.\!?,'/()])/ 1 /g" | tr "[:upper:]" "[:lower:]" &gt; cooking.preprocessed.txt
storepathData	= "/home/unaiqueFRAMEWORK/data"
storepath 		= "/home/unaiqueFRAMEWORK/text_classify/fasttext/training_data"
managerDB		= storepath+"/manager_fasttext_train.txt"
parentsDB		= storepath+"/parents_fasttext_train.txt"
teenagerDB		= storepath+"/teenager_fasttext_train.txt"

manager 		= {"bloomberg":'bloomberg.com', "cio":'cio.com', "hbr":'hbr.org', "forbes":'forbes.com', 'foreignaffairs':'foreignaffairs.com', 'ceo':'ceo.com','chiefexecutive':'chiefexecutive.net', 'economist':'economist.com', 'strategy-business':'strategy-business.com','managementexchange':'managementexchange.com', 'real-leaders':'real-leaders.com', 'inc':'inc.com', 'n2growth':'n2growth.com', "wsj" : 'wsj.com'}
parents 		= {"babble" : 'babble.com', "alphamom":'alphamom.com'}
teenager 		= {"teenink" : 'teenink.com', 'teenreads':'teenreads.com','girlscouts':'girlscouts.org','teenreads':'teenreads.com','teenkidsnews.com':'teenkidsnews','teensource':'teensource.org','teenvogue':'teenvogue.com','teenspeak':'teenspeak.org','teensgotcents':'teensgotcents.com','theteenagertoday':'theteenagertoday.com'}

#https://gist.github.com/bittlingmayer/7139a6a75ba0dbbc3a06325394ae3a13
SUBEXES = ["s/’/'/g", "s/′/'/g", "s/''/ /g", "s/'/ ' /g", 's/“/"/g', 's/”/"/g', 's/"/ /g', "s/\\./ \\. /g", "s/<br \\/>/ /g", "s/, / , /g", "s/(/ ( /g", "s/)/ ) /g", "s/\\!/ \\! /g", "s/\\?/ \\? /g", "s/\\;/ /g", "s/\\:/ /g", "s/-/ - /g", "s/=/ /g", "s/=/ /g", "s/*/ /g", "s/|/ /g", "s/«/ /g"]


model 		= fasttext.load_model("/home/unaiqueFRAMEWORK/new_prototyp/fasttext/binary/unaique_full.bin")
db_file		= "/home/unaiqueFRAMEWORK/new_prototyp/fasttext/trainingsdata/train/manager_aa"	# Training on new Sample
csv_file	= "/home/unaiqueFRAMEWORK/new_prototyp/fasttext/testing_manager.csv"	# Training on new Sample
current		= "manager"

w=codecs.open(csv_file, 'w', encoding='utf-8')
w.write("is_label;predict_labal;is_probability;is_entry\n")
# Open one result for processing
with codecs.open(db_file, 'r', encoding='ascii') as fr:
	#text 	= fr.read().replace('\n', '').strip().lower()
	text 	= fr.readline()
fr.close()
print("Read File complete! Entries:", len(text))
#exit(1)

count = 0
for t in text:
	#t 			= t.replace('\n', '').strip().lower()
	data3 		= re.sub("^[A-Za-z0-9,.!?:;]", "", t)
	try:
		p			= model.predict(data3, k=1, threshold=0.0)
		label 		= str(p[0][0])
		probability = str(float(p[1].tolist()[0]))
		print("Entries:", len(text))
		print("Predicted:",label)
		print("Probability:",probability)
		print("Current:",current)
		print("Entry:",count)
		print("##########################")
		#w.write("is_label;predict_labal;is_probability;is_entry")
		count = count + 1
		w.write(current+";"+label+";"+probability+";"+str(count)+"\n")
	except Exception as err2:
		1
	

w.close()
exit(1)

def __normalize_text(s):
	for subex in SUBEXES:
		s = subprocess.check_output(['sed', subex], input=s.encode("latin-1")).decode("utf-8")	# s = subprocess.check_output(['sed', subex], input=s.encode("iso-8859-1")).decode("utf-8")
	return s

def __spaces(s):
    return ' '.join(s.split())

def __digits(s):
    return ''.join(filter(lambda c: not c.isdigit(), s))

def preproc(s):
    return __digits(__spaces(s.lower()))

def beautifyCorpus(text):
	rList				= list()
	nlp_de.max_length 	= len(text) + 1
	doc 				= nlp_de(text)
	for sent in doc.sents:
		s				= str(sent)
		flag			= isDublicateString(s)
		flag2			= isGoodCasing(s)
		if not flag and flag2:
			rList.append(s)
			
	return " ".join(rList)
		
def isDublicateString(text):
	t_Text 			= text.lower()
	#debug 			= True
	patternElement 	= ""
	n				= 16# neun zeichen müssen gleich sein
       
	if len(t_Text) >= n+3:
		patternElement = t_Text[:n]
		patternElement = patternElement.strip()
		#print("PatternElement:", patternElement)
		r_Count = t_Text.count(patternElement)
		if r_Count > 1:
			#if debug:
			#	print("sentify.isNotDublicateString(): -> multiplePattern Error:", r_Count, ",", patternElement)
			return True
	
	#satz = "Breath of the WildThe Legend of Zelda: Breath of the Wild auf der E3 angespielt."
	#['breath of th', 'e wildthe le', 'gend of zeld', 'a: breath of', ' the wild au', 'f der e3 ang', 'espielt.']
	t_List = [t_Text[i:i+n] for i in range(0, len(t_Text), n)]
	for t in t_List:
		t = t.strip()
		r_Count = t_Text.count(t)
		if r_Count > 1:
			#if debug:
			#	print("sentify.isNotDublicateString(): -> multiplePattern Error:", r_Count, ",", t)
			return True
	
	return False

def isGoodCasing(text):
	if not isinstance(text, str):
		return False
	
	strString 	= text #str(text,'utf-8')
	i=0
	try:
		for c in strString:
			if c in [u'ß',u'ö',u'ä',u'ü',u'Ö',u'Ä',u'Ü']:
				1
			elif c != " " and len(strString) >=i+1:
				if c.islower() and strString[i+1].isupper():
					# zuviel positive falses
					#print("sentify.isGoodCasing(): -> aA Error")
					#aA
					return False
				if c.isdigit() and (strString[i+1].isupper() and strString[i+1].isalpha()):
					#if debug:
					#	print("sentify.isGoodCasing(): -> 2P Error")
					return False
				elif c.isdigit() and strString[i+1].isalpha():
					# to much positive falses
					#print("sentify.isGoodCasing(): -> 2a Error")
					#2a -> Achtung, eventuell bei Adressen als Ausgabe gibt es Probleme
					1# return False
				elif c.isdigit() and strString[i+1].isdigit():
					1 # 11
				elif c.isalpha() and strString[i+1].isdigit():
					#if debug:
					#	print("sentify.isGoodCasing(): -> a2 Error")
					# a2
					return False
				elif c.isdigit() and strString[i+1] not in ['.','!','?','"','-','\'', ':',' ']:
					#if debug:
					#	print("sentify.isGoodCasing -> text.lower()/upper() Error:")
					return False
		i=i+1
	except:
		1
	
	return True

def strip_accents(text):
	return "".join(char for char in unicodedata.normalize('NFKD', text) if unicodedata.category(char) != 'Mn')

def parse_html(page):
	""" Clean HTML tags for webpages that aren't Gutenberg books """
	try:
		# https://github.com/miso-belica/jusText/tree/dev/justext/stoplists
		parts = justext.justext(page, justext.get_stoplist('English'))
		#parts = justext.justext(page, justext.get_stoplist('German'))
		#print(parts)
		paragraphs = list()
		for part in parts:
			#pp.pprint(part.is_boilerplate)
			if not part.is_boilerplate:
				#pp.pprint(part)
				paragraphs.append(part.text)
		s=str('\n\n'.join(paragraphs))
		if len(s) > 50:
			return s
		else:
			soup 			= BeautifulSoup(page, "html.parser") 				#"html.parser" "lxml")
			#comments 		= soup.findAll(text=lambda text:isinstance(text, Comment)) 	# remove comments
			#[comment.extract() for comment in comments]
			for script in soup.findAll(["script", "style", 'footer', 'head']):
				#script.extract()		# rip it out
				script.decompose()		# rip it out
			#[x.extract() for x in soup.findAll(['script', 'style'])]
			#[x.decompose() for x in soup.findAll(['script', 'style'])]
			soup.prettify()
			myText 			= soup.get_text()
			plaintextv1		= bleach.clean(myText, strip=True, strip_comments=True)
			plaintext		= re.sub(r'<.*?>', '', plaintextv1)
			#plaintext		= plaintext.replace('\n', ' ')
			#plaintext		= plaintext.replace("\n", ' ')
			#plaintext		= textify.removeDomainsSimple(plaintext)
			
			return plaintext#.strip()		
	
	except Exception as e:
		#print("Unexpected error:", sys.exc_info()[0])
		exc_type, exc_obj, exc_tb = sys.exc_info()
		fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
		#print(exc_type, fname, exc_tb.tb_lineno)
		
	return str("")

def encodeToASCII(text):
	encResults  = text.encode('utf-8', "ignore")
	s_string	= str(encResults.decode('ascii', "ignore"))
	#textv1 		= re_pattern.sub(u'\uFFFD', s_string)
	return s_string

def encodeToUTF8Adv(text):
	encResults 	= text.encode('utf-8', "ignore")
	#return str(encResults.decode('latin-1', "ignore"))
	s_string	= str(encResults.decode('utf-8', "remove"))
	#textv1 		= re_pattern.sub(u'\uFFFD', s_string)
	return s_string
	
def encodeToLatin1(text):
	#text 		= text.replace('ß','ss')
	encResults  = text.encode('utf-8', "ignore")
	#encResults = text.encode('utf-8', "ignore")
	s_string	= str(encResults.decode('latin-1', "ignore"))
	#textv1 		= re_pattern.sub(u'\uFFFD', s_string)
	return s_string

"""
	MAIN 
"""

"""
for key in manager:
	#print (key)				# key
	#print (manager[key])	# value
	mystorepath = storepathData+"/"+key
	if os.path.exists(mystorepath):
		folders = list()
		files 	= list()
		 
		for entry in os.scandir(mystorepath):
			if entry.is_dir():
				folders.append(entry.path)
			elif entry.is_file():
				files.append(entry.path)
		 
		for f in files:
			#print(f)
			with codecs.open(f, 'r', encoding='utf-8') as fr:
				data3 = fr.read().replace('\n', '').strip()
			fr.close()
			
			#tra		= transformText(h)
			#data2 = beautifyCorpus(data1)
			#data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
			data3 = encodeToASCII(data3)
			#data3 = preproc(data3)
			if len(data3) > 50:
				with codecs.open(managerDB, 'a+', encoding='ascii') as fa:
					#fa.write("manager###"+tra+"\n")
					fa.write("__label__MANAGER"+" "+data3+"\n")
				fa.close()

for key in teenager:
	#print (key)				# key
	#print (manager[key])	# value
	mystorepath = storepathData+"/"+key
	if os.path.exists(mystorepath):
		folders = list()
		files 	= list()
		 
		for entry in os.scandir(mystorepath):
			if entry.is_dir():
				folders.append(entry.path)
			elif entry.is_file():
				files.append(entry.path)
		 
		for f in files:
			#print(f)
			with codecs.open(f, 'r', encoding='utf-8') as fr:
				data3 = fr.read().replace('\n', '').strip()
			fr.close()
			
			#tra		= transformText(h)
			#data2 = beautifyCorpus(data1)
			#data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
			data3 = encodeToASCII(data3)
			#data3 = preproc(data3)
			if len(data3) > 50:
				with codecs.open(teenagerDB, 'a+', encoding='ascii') as fa:
					#fa.write("teenager###"+tra+"\n")
					fa.write("__label__TEENAGER"+" "+data3+"\n")
				fa.close()

for key in parents:
	#print (key)				# key
	#print (manager[key])	# value
	mystorepath = storepathData+"/"+key
	if os.path.exists(mystorepath):
		folders = list()
		files 	= list()
		 
		for entry in os.scandir(mystorepath):
			if entry.is_dir():
				folders.append(entry.path)
			elif entry.is_file():
				files.append(entry.path)
		 
		for f in files:
			#print(f)
			with codecs.open(f, 'r', encoding='utf-8') as fr:
				data3 = fr.read().replace('\n', '').strip()
			fr.close()
			
			#tra		= transformText(h)
			#data2 = beautifyCorpus(data1)
			#data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
			data3 = encodeToASCII(data3)
			#data3 = preproc(data3)
			if len(data3) > 50:
				with codecs.open(parentsDB, 'a+', encoding='ascii') as fa:
					#fa.write("parents###"+tra+"\n")
					fa.write("__label__PARENTS"+" "+data3+"\n")
				fa.close()
			

folders = list()
files 	= list()
 
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/manager/"):
	if entry.is_dir():
		folders.append(entry.path)
	elif entry.is_file():
		files.append(entry.path)
 
for f in files:
	if f.find("en")!= -1:
		with codecs.open(f, 'r', encoding='utf-8') as fr:
			data3 = fr.read().replace('\n', '').strip()
		fr.close()
		
		data3	= parse_html(data3)
		###tra		= transformText(h)
		#data2 = beautifyCorpus(h)
		#data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
		data3 = encodeToASCII(data3)
		#data3 = preproc(data3)
		if len(data3) > 50:
			with codecs.open(managerDB, 'a+', encoding='ascii') as fa:
				#fa.write("manager###"+tra+"\n")
				fa.write("__label__MANAGER"+" "+data3+"\n")
			fa.close()


folders = list()
files 	= list()
 
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/parents/"):
	if entry.is_dir():
		folders.append(entry.path)
	elif entry.is_file():
		files.append(entry.path)
 
for f in files:
	if f.find("en")!= -1:
		with codecs.open(f, 'r', encoding='utf-8') as fr:
			data3 = fr.read().replace('\n', '').strip()
		fr.close()
		
		data3	= parse_html(data3)
		###tra		= transformText(h)
		#data2 = beautifyCorpus(h)
		#data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
		data3 = encodeToASCII(data3)
		#data3 = preproc(data3)
		if len(data3) > 50:
			with codecs.open(parentsDB, 'a+', encoding='ascii') as fa:
				#fa.write("parents###"+tra+"\n")
				fa.write("__label__PARENTS"+" "+data3+"\n")
			fa.close()

folders = list()
files 	= list()
 
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/teenager/"):
	if entry.is_dir():
		folders.append(entry.path)
	elif entry.is_file():
		files.append(entry.path)
 
for f in files:
	if f.find("en")!= -1:
		with codecs.open(f, 'r', encoding='utf-8') as fr:
			data3 = fr.read().replace('\n', '').strip()
		fr.close()
		
		data3	= parse_html(data3)
		###tra		= transformText(h)
		#data2 = beautifyCorpus(h)
		#data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
		data3 = encodeToASCII(data3)
		#data3 = preproc(data3)
		if len(data3) > 50:
			with codecs.open(teenagerDB, 'a+', encoding='ascii') as fa:
				#fa.write("teenager###"+tra+"\n")
				fa.write("__label__TEENAGER"+" "+data3+"\n")
			fa.close()

"""

"""
exit(1)
"""

"""
print( p[0][0] ) 
print( type(p[0][0]) )
print()
print()
print(  p[1].tolist() ) 
print( type(p[1].tolist()) )
"""
exit(1)

#fasttext.load_model(self.fasttext_model_file) 
#model.predict("Which baking dish is best to bake a banana bread ?", k=-1, threshold=0.5)
#model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000)
#model.save_model("/home/unaiqueFRAMEWORK/text_classify/fasttext/binary/unaique.ftz")