# -*- coding: utf-8 -*-
#!/usr/bin/python3.6 -S -W ignore

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from tabulate import tabulate
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
import logging
import datetime
import pprint
import codecs
import numpy as np
import dill # wichtig für joblib load and dump
import os
import math

import nltk                       # the natural langauage toolkit, open-source NLP
import pandas as pd               # pandas dataframe
import re                         # regular expression
from nltk.corpus import stopwords  
from gensim.utils import lemmatize
from gensim import parsing        # Help in preprocessing the data, very efficiently
import gensim
import numpy as np
import pprint
import lxml
import codecs
import os,sys
import gzip
import json
import os

import os
import codecs
import os,sys
import gzip
import json
import re
import subprocess
from fasttext import train_supervised
import fasttext
import pprint

pp = pprint.PrettyPrinter(indent=4)

import spacy  # See "Installing spaCy"
nlp_de				= spacy.load('de_core_news_sm')
nlp_en 				= spacy.load('en_core_web_sm')
nlp_de.max_length 	= 10000000
nlp_en.max_length 	= 10000000

def _is_wordlike(tok):
	return tok.orth_ and tok.orth_[0].isalpha()

def sentence_division_suppresor(doc):
	"""Spacy pipeline component that prohibits sentence segmentation between two tokens that start with a letter.
	Useful for taming overzealous sentence segmentation in German model, possibly others as well."""
	for i, tok in enumerate(doc[:-1]):
		if _is_wordlike(tok) and _is_wordlike(doc[i + 1]):
			doc[i + 1].is_sent_start = False
	return doc

nlp_de.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser')
nlp_en.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser')

def transformText(text):
	# https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#stanfordcorenlplemmatization

	stops = set(stopwords.words("english"))

	# Convert text to lower
	text = text.lower()
	
	# Removing non ASCII chars    
	text = re.sub(r'[^\x00-\x7f]',r' ',text)

	# Strip multiple whitespaces
	text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)

	# Removing all the stopwords
	filtered_words = [word for word in text.split() if word not in stops]

	# Removing all the tokens with lesser than 3 characters
	filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)

	# Preprocessed text after stop words removal
	text = " ".join(filtered_words)

	# Remove the punctuation
	text = gensim.parsing.preprocessing.strip_punctuation2(text)

	# Strip all the numerics
	text = gensim.parsing.preprocessing.strip_numeric(text)

	# Strip multiple whitespaces
	text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
		
	# Stemming
	#return gensim.parsing.preprocessing.stem_text(text)

	# Lemmatize
	t = [wd.decode('utf-8').split('/')[0] for wd in gensim.utils.lemmatize(text)]
	return str(" ".join(t))

# cat cooking.stackexchange.txt | sed -e "s/([.\!?,'/()])/ 1 /g" | tr "[:upper:]" "[:lower:]" &gt; cooking.preprocessed.txt
storepathData	= "/home/unaiqueFRAMEWORK/data"
storepath 		= "/home/unaiqueFRAMEWORK/text_classify/fasttext/training_data"
managerDB		= storepath+"/manager_fasttext_train.txt"
parentsDB		= storepath+"/parents_fasttext_train.txt"
teenagerDB		= storepath+"/teenager_fasttext_train.txt"

manager 		= {"bloomberg":'bloomberg.com', "cio":'cio.com', "hbr":'hbr.org', "forbes":'forbes.com', 'foreignaffairs':'foreignaffairs.com', 'ceo':'ceo.com','chiefexecutive':'chiefexecutive.net', 'economist':'economist.com', 'strategy-business':'strategy-business.com','managementexchange':'managementexchange.com', 'real-leaders':'real-leaders.com', 'inc':'inc.com', 'n2growth':'n2growth.com', "wsj" : 'wsj.com'}
parents 		= {"babble" : 'babble.com', "alphamom":'alphamom.com'}
teenager 		= {"teenink" : 'teenink.com', 'teenreads':'teenreads.com','girlscouts':'girlscouts.org','teenreads':'teenreads.com','teenkidsnews.com':'teenkidsnews','teensource':'teensource.org','teenvogue':'teenvogue.com','teenspeak':'teenspeak.org','teensgotcents':'teensgotcents.com','theteenagertoday':'theteenagertoday.com'}

#https://gist.github.com/bittlingmayer/7139a6a75ba0dbbc3a06325394ae3a13
SUBEXES = ["s/’/'/g", "s/′/'/g", "s/''/ /g", "s/'/ ' /g", 's/“/"/g', 's/”/"/g', 's/"/ /g', "s/\\./ \\. /g", "s/<br \\/>/ /g", "s/, / , /g", "s/(/ ( /g", "s/)/ ) /g", "s/\\!/ \\! /g", "s/\\?/ \\? /g", "s/\\;/ /g", "s/\\:/ /g", "s/-/ - /g", "s/=/ /g", "s/=/ /g", "s/*/ /g", "s/|/ /g", "s/«/ /g"]

model 		= fasttext.load_model("/home/unaiqueFRAMEWORK/new_prototyp/fasttext/binary/unaique_fullTest2.bin")
db_file		= "/home/unaiqueFRAMEWORK/new_prototyp/trainingsdata/train/manager_ab"	# Training on new Sample
csv_file	= "/home/unaiqueFRAMEWORK/new_prototyp/fasttext/testing_manager.csv"	# Training on new Sample
result_file = "/home/unaiqueFRAMEWORK/new_prototyp/fasttext/result_30_manager.txt"
result_data_file = "/home/unaiqueFRAMEWORK/new_prototyp/fasttext/result_30_manager_data.txt"
current		= "manager"

w=codecs.open(csv_file, 'w', encoding='utf-8')
w.write("is_label;predict_labal;is_probability;is_entry\n")
# Open one result for processing
with codecs.open(db_file, 'r', encoding='ascii') as fr:
	#text 	= fr.read().replace('\n', '').strip().lower()
	text 	= fr.read().lower().split('\n')
	#text 	= fr.readline()
fr.close()
print("Read File complete! Entries:", len(text))
#exit(1)

count = 0
for t in text:
	#print(t)
	docs		= transformText(t) # für t_manager.txt auskommentiert
	#t 			= t.replace('\n', '').strip().lower()
	#data3 		= re.sub("^[A-Za-z0-9,.!?:;]", "", t)
	#try:
	p			= model.predict(docs, k=1, threshold=0.0)
	label 		= str(p[0][0])
	probability = str(float(p[1].tolist()[0]))
	print("Entries:", len(text))
	print("Predicted:",label)
	print("Probability:",probability)
	print("Current:",current)
	print("Entry:",count)
	print("##########################")
	#w.write("is_label;predict_labal;is_probability;is_entry")
	count = count + 1
	w.write(current+";"+label+";"+probability+";"+str(count)+"\n")
	
	with codecs.open(result_file, 'a+', encoding='utf-8') as fa:
		fa.write(label+";"+str(probability)+"\n")
	fa.close()

	if label == "__label__TEENAGER":
		with codecs.open(result_data_file, 'a+', encoding='utf-8') as fa:
			fa.write(str(t)+"\n\n\n")
		fa.close()
			
	#except Exception as err2:
	#	1
	

w.close()
exit(1)

def __normalize_text(s):
	for subex in SUBEXES:
		s = subprocess.check_output(['sed', subex], input=s.encode("latin-1")).decode("utf-8")	# s = subprocess.check_output(['sed', subex], input=s.encode("iso-8859-1")).decode("utf-8")
	return s

def __spaces(s):
    return ' '.join(s.split())

def __digits(s):
    return ''.join(filter(lambda c: not c.isdigit(), s))

def preproc(s):
    return __digits(__spaces(s.lower()))

def beautifyCorpus(text):
	rList				= list()
	nlp_de.max_length 	= len(text) + 1
	doc 				= nlp_de(text)
	for sent in doc.sents:
		s				= str(sent)
		flag			= isDublicateString(s)
		flag2			= isGoodCasing(s)
		if not flag and flag2:
			rList.append(s)
			
	return " ".join(rList)
		
def isDublicateString(text):
	t_Text 			= text.lower()
	#debug 			= True
	patternElement 	= ""
	n				= 16# neun zeichen müssen gleich sein
       
	if len(t_Text) >= n+3:
		patternElement = t_Text[:n]
		patternElement = patternElement.strip()
		#print("PatternElement:", patternElement)
		r_Count = t_Text.count(patternElement)
		if r_Count > 1:
			#if debug:
			#	print("sentify.isNotDublicateString(): -> multiplePattern Error:", r_Count, ",", patternElement)
			return True
	
	#satz = "Breath of the WildThe Legend of Zelda: Breath of the Wild auf der E3 angespielt."
	#['breath of th', 'e wildthe le', 'gend of zeld', 'a: breath of', ' the wild au', 'f der e3 ang', 'espielt.']
	t_List = [t_Text[i:i+n] for i in range(0, len(t_Text), n)]
	for t in t_List:
		t = t.strip()
		r_Count = t_Text.count(t)
		if r_Count > 1:
			#if debug:
			#	print("sentify.isNotDublicateString(): -> multiplePattern Error:", r_Count, ",", t)
			return True
	
	return False

def isGoodCasing(text):
	if not isinstance(text, str):
		return False
	
	strString 	= text #str(text,'utf-8')
	i=0
	try:
		for c in strString:
			if c in [u'ß',u'ö',u'ä',u'ü',u'Ö',u'Ä',u'Ü']:
				1
			elif c != " " and len(strString) >=i+1:
				if c.islower() and strString[i+1].isupper():
					# zuviel positive falses
					#print("sentify.isGoodCasing(): -> aA Error")
					#aA
					return False
				if c.isdigit() and (strString[i+1].isupper() and strString[i+1].isalpha()):
					#if debug:
					#	print("sentify.isGoodCasing(): -> 2P Error")
					return False
				elif c.isdigit() and strString[i+1].isalpha():
					# to much positive falses
					#print("sentify.isGoodCasing(): -> 2a Error")
					#2a -> Achtung, eventuell bei Adressen als Ausgabe gibt es Probleme
					1# return False
				elif c.isdigit() and strString[i+1].isdigit():
					1 # 11
				elif c.isalpha() and strString[i+1].isdigit():
					#if debug:
					#	print("sentify.isGoodCasing(): -> a2 Error")
					# a2
					return False
				elif c.isdigit() and strString[i+1] not in ['.','!','?','"','-','\'', ':',' ']:
					#if debug:
					#	print("sentify.isGoodCasing -> text.lower()/upper() Error:")
					return False
		i=i+1
	except:
		1
	
	return True

def strip_accents(text):
	return "".join(char for char in unicodedata.normalize('NFKD', text) if unicodedata.category(char) != 'Mn')

def parse_html(page):
	""" Clean HTML tags for webpages that aren't Gutenberg books """
	try:
		# https://github.com/miso-belica/jusText/tree/dev/justext/stoplists
		parts = justext.justext(page, justext.get_stoplist('English'))
		#parts = justext.justext(page, justext.get_stoplist('German'))
		#print(parts)
		paragraphs = list()
		for part in parts:
			#pp.pprint(part.is_boilerplate)
			if not part.is_boilerplate:
				#pp.pprint(part)
				paragraphs.append(part.text)
		s=str('\n\n'.join(paragraphs))
		if len(s) > 50:
			return s
		else:
			soup 			= BeautifulSoup(page, "html.parser") 				#"html.parser" "lxml")
			#comments 		= soup.findAll(text=lambda text:isinstance(text, Comment)) 	# remove comments
			#[comment.extract() for comment in comments]
			for script in soup.findAll(["script", "style", 'footer', 'head']):
				#script.extract()		# rip it out
				script.decompose()		# rip it out
			#[x.extract() for x in soup.findAll(['script', 'style'])]
			#[x.decompose() for x in soup.findAll(['script', 'style'])]
			soup.prettify()
			myText 			= soup.get_text()
			plaintextv1		= bleach.clean(myText, strip=True, strip_comments=True)
			plaintext		= re.sub(r'<.*?>', '', plaintextv1)
			#plaintext		= plaintext.replace('\n', ' ')
			#plaintext		= plaintext.replace("\n", ' ')
			#plaintext		= textify.removeDomainsSimple(plaintext)
			
			return plaintext#.strip()		
	
	except Exception as e:
		#print("Unexpected error:", sys.exc_info()[0])
		exc_type, exc_obj, exc_tb = sys.exc_info()
		fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
		#print(exc_type, fname, exc_tb.tb_lineno)
		
	return str("")

def encodeToASCII(text):
	encResults  = text.encode('utf-8', "ignore")
	s_string	= str(encResults.decode('ascii', "ignore"))
	#textv1 		= re_pattern.sub(u'\uFFFD', s_string)
	return s_string

def encodeToUTF8Adv(text):
	encResults 	= text.encode('utf-8', "ignore")
	#return str(encResults.decode('latin-1', "ignore"))
	s_string	= str(encResults.decode('utf-8', "remove"))
	#textv1 		= re_pattern.sub(u'\uFFFD', s_string)
	return s_string
	
def encodeToLatin1(text):
	#text 		= text.replace('ß','ss')
	encResults  = text.encode('utf-8', "ignore")
	#encResults = text.encode('utf-8', "ignore")
	s_string	= str(encResults.decode('latin-1', "ignore"))
	#textv1 		= re_pattern.sub(u'\uFFFD', s_string)
	return s_string

"""
	MAIN 
"""

"""
for key in manager:
	#print (key)				# key
	#print (manager[key])	# value
	mystorepath = storepathData+"/"+key
	if os.path.exists(mystorepath):
		folders = list()
		files 	= list()
		 
		for entry in os.scandir(mystorepath):
			if entry.is_dir():
				folders.append(entry.path)
			elif entry.is_file():
				files.append(entry.path)
		 
		for f in files:
			#print(f)
			with codecs.open(f, 'r', encoding='utf-8') as fr:
				data3 = fr.read().replace('\n', '').strip()
			fr.close()
			
			#tra		= transformText(h)
			#data2 = beautifyCorpus(data1)
			#data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
			data3 = encodeToASCII(data3)
			#data3 = preproc(data3)
			if len(data3) > 50:
				with codecs.open(managerDB, 'a+', encoding='ascii') as fa:
					#fa.write("manager###"+tra+"\n")
					fa.write("__label__MANAGER"+" "+data3+"\n")
				fa.close()

for key in teenager:
	#print (key)				# key
	#print (manager[key])	# value
	mystorepath = storepathData+"/"+key
	if os.path.exists(mystorepath):
		folders = list()
		files 	= list()
		 
		for entry in os.scandir(mystorepath):
			if entry.is_dir():
				folders.append(entry.path)
			elif entry.is_file():
				files.append(entry.path)
		 
		for f in files:
			#print(f)
			with codecs.open(f, 'r', encoding='utf-8') as fr:
				data3 = fr.read().replace('\n', '').strip()
			fr.close()
			
			#tra		= transformText(h)
			#data2 = beautifyCorpus(data1)
			#data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
			data3 = encodeToASCII(data3)
			#data3 = preproc(data3)
			if len(data3) > 50:
				with codecs.open(teenagerDB, 'a+', encoding='ascii') as fa:
					#fa.write("teenager###"+tra+"\n")
					fa.write("__label__TEENAGER"+" "+data3+"\n")
				fa.close()

for key in parents:
	#print (key)				# key
	#print (manager[key])	# value
	mystorepath = storepathData+"/"+key
	if os.path.exists(mystorepath):
		folders = list()
		files 	= list()
		 
		for entry in os.scandir(mystorepath):
			if entry.is_dir():
				folders.append(entry.path)
			elif entry.is_file():
				files.append(entry.path)
		 
		for f in files:
			#print(f)
			with codecs.open(f, 'r', encoding='utf-8') as fr:
				data3 = fr.read().replace('\n', '').strip()
			fr.close()
			
			#tra		= transformText(h)
			#data2 = beautifyCorpus(data1)
			#data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
			data3 = encodeToASCII(data3)
			#data3 = preproc(data3)
			if len(data3) > 50:
				with codecs.open(parentsDB, 'a+', encoding='ascii') as fa:
					#fa.write("parents###"+tra+"\n")
					fa.write("__label__PARENTS"+" "+data3+"\n")
				fa.close()
			

folders = list()
files 	= list()
 
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/manager/"):
	if entry.is_dir():
		folders.append(entry.path)
	elif entry.is_file():
		files.append(entry.path)
 
for f in files:
	if f.find("en")!= -1:
		with codecs.open(f, 'r', encoding='utf-8') as fr:
			data3 = fr.read().replace('\n', '').strip()
		fr.close()
		
		data3	= parse_html(data3)
		###tra		= transformText(h)
		#data2 = beautifyCorpus(h)
		#data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
		data3 = encodeToASCII(data3)
		#data3 = preproc(data3)
		if len(data3) > 50:
			with codecs.open(managerDB, 'a+', encoding='ascii') as fa:
				#fa.write("manager###"+tra+"\n")
				fa.write("__label__MANAGER"+" "+data3+"\n")
			fa.close()


folders = list()
files 	= list()
 
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/parents/"):
	if entry.is_dir():
		folders.append(entry.path)
	elif entry.is_file():
		files.append(entry.path)
 
for f in files:
	if f.find("en")!= -1:
		with codecs.open(f, 'r', encoding='utf-8') as fr:
			data3 = fr.read().replace('\n', '').strip()
		fr.close()
		
		data3	= parse_html(data3)
		###tra		= transformText(h)
		#data2 = beautifyCorpus(h)
		#data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
		data3 = encodeToASCII(data3)
		#data3 = preproc(data3)
		if len(data3) > 50:
			with codecs.open(parentsDB, 'a+', encoding='ascii') as fa:
				#fa.write("parents###"+tra+"\n")
				fa.write("__label__PARENTS"+" "+data3+"\n")
			fa.close()

folders = list()
files 	= list()
 
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/teenager/"):
	if entry.is_dir():
		folders.append(entry.path)
	elif entry.is_file():
		files.append(entry.path)
 
for f in files:
	if f.find("en")!= -1:
		with codecs.open(f, 'r', encoding='utf-8') as fr:
			data3 = fr.read().replace('\n', '').strip()
		fr.close()
		
		data3	= parse_html(data3)
		###tra		= transformText(h)
		#data2 = beautifyCorpus(h)
		#data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
		data3 = encodeToASCII(data3)
		#data3 = preproc(data3)
		if len(data3) > 50:
			with codecs.open(teenagerDB, 'a+', encoding='ascii') as fa:
				#fa.write("teenager###"+tra+"\n")
				fa.write("__label__TEENAGER"+" "+data3+"\n")
			fa.close()

"""

"""
exit(1)
"""

"""
print( p[0][0] ) 
print( type(p[0][0]) )
print()
print()
print(  p[1].tolist() ) 
print( type(p[1].tolist()) )
"""
exit(1)

#fasttext.load_model(self.fasttext_model_file) 
#model.predict("Which baking dish is best to bake a banana bread ?", k=-1, threshold=0.5)
#model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000)
#model.save_model("/home/unaiqueFRAMEWORK/text_classify/fasttext/binary/unaique.ftz")