# read in some helpful libraries
import nltk                      # the natural language toolkit, open-source NLP
import pandas as pd              # pandas dataframe
import numpy as np
import re                        # regular expressions
import gensim
from gensim import parsing       # helps to preprocess the data very efficiently
from gensim.utils import lemmatize
from nltk.corpus import stopwords

import codecs
import concurrent.futures
import datetime
import gzip
import io
import json
import logging
import math
import multiprocessing as mpl
import os
import pprint
import random
import signal
import sys
import unicodedata
import uuid
from contextlib import contextmanager
from io import BytesIO, StringIO
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from pathlib import Path
from time import time as timer

#import cdx_toolkit
import lxml
import warcio                    # pip3 install --upgrade warcio
from warcio.archiveiterator import ArchiveIterator
import justext                   # pip install justext
import requests                  # pip3 install --upgrade requests
#import urllib.request
#from urllib.parse import urlparse  # pip3 install --upgrade urlparse
#import urllib  # url = urllib.unquote(url).decode('utf8')
import urllib3
urllib3.disable_warnings()
from bs4 import BeautifulSoup, Comment
import bleach                    # https://pypi.org/project/bleach/
import dill                      # important for joblib load and dump
import langid
from langdetect import detect    # also https://github.com/saffsd/langid.py

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from tabulate import tabulate

pp = pprint.PrettyPrinter(indent=4)

import spacy  # see "Installing spaCy"

nlp_de = spacy.load('de_core_news_sm')
nlp_en = spacy.load('en_core_web_sm')
nlp_de.max_length = 10000000
nlp_en.max_length = 10000000


def _is_wordlike(tok):
    return tok.orth_ and tok.orth_[0].isalpha()


def sentence_division_suppresor(doc):
    """spaCy pipeline component that prohibits sentence segmentation between
    two tokens that start with a letter.

    Useful for taming overzealous sentence segmentation in the German model,
    possibly others as well."""
    for i, tok in enumerate(doc[:-1]):
        if _is_wordlike(tok) and _is_wordlike(doc[i + 1]):
            doc[i + 1].is_sent_start = False
    return doc


# spaCy 2.x API: custom components are passed in as plain functions
nlp_de.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser')
nlp_en.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser')
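# --- hedged example (not in the original script, sample sentence is made up):
# --- quick sanity check of the 'sent_fix' component registered above.
_demo = nlp_de('Der Vorstand trifft sich am Montag Das Protokoll folgt später')
print([s.text for s in _demo.sents])  # expected: a single "sentence", because
                                      # splits between two word-like tokens are suppressed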
"""
500 MB source file for training:
  350 MB manager
  150 MB teenager
then train for ~3 h on Colab and store the results in Google Drive
"""

# Sample random lines from a file:  perl -ne 'print if (rand() < .1)' manager_train.txt > manager_training.txt
# random sampling -> https://unix.stackexchange.com/questions/108581/how-to-randomly-sample-a-subset-of-a-file
#cat teenink/* teenreads/* girlscouts/* > kids.txt
#cat alphamom/* babble/* > parents.txt
#cat bloomberg/* ceo/* chiefexecutive/* cio/* forbes/* foreignaffairs/* hbr/* inc/* managementexchange/* n2growth/* real-leaders/* strategy-business/* > manager.txt
#cat bloomberg/1* >> manager.txt
#https://www.kaggle.com/arunava21/word2vec-and-random-forest-classification

storepathData = "/home/unaiqueFRAMEWORK/data"
storepath = "/home/unaiqueFRAMEWORK/text_classify/training_data"
#managerDB = storepath + "/gpt2_manager_train.txt"
managerDB = "/home/unaiqueFRAMEWORK/new_prototyp/training/data/manager_train.txt"

manager = {"bloomberg": 'bloomberg.com', "cio": 'cio.com', "hbr": 'hbr.org',
           "forbes": 'forbes.com', 'foreignaffairs': 'foreignaffairs.com',
           'ceo': 'ceo.com', 'chiefexecutive': 'chiefexecutive.net',
           'economist': 'economist.com', 'strategy-business': 'strategy-business.com',
           'managementexchange': 'managementexchange.com', 'real-leaders': 'real-leaders.com',
           'inc': 'inc.com', 'n2growth': 'n2growth.com', "wsj": 'wsj.com'}

#https://stackoverflow.com/questions/12221387/how-to-extract-the-first-x-megabyte-from-a-large-file-in-unix-linux
print(len(manager))
print()
print(150 / len(manager))  # the whole teenager text is about 145 MB
#exit(1)


def beautifyCorpus(text):
    # currently a no-op; the sentence-level filtering below is disabled
    return text
    """
    rList = list()
    nlp_de.max_length = len(text) + 1
    doc = nlp_de(text)
    for sent in doc.sents:
        s = str(sent)
        flag = isDublicateString(s)
        flag2 = isGoodCasing(s)
        if not flag and flag2:
            rList.append(s)
    return " ".join(rList)
    """


def isDublicateString(text):
    """Return True if the text contains the same 16-character chunk more than once."""
    t_Text = text.lower()
    #debug = True
    patternElement = ""
    n = 16  # 16 consecutive characters must match
    if len(t_Text) >= n + 3:
        patternElement = t_Text[:n].strip()
        #print("PatternElement:", patternElement)
        r_Count = t_Text.count(patternElement)
        if r_Count > 1:
            #print("sentify.isNotDublicateString(): -> multiplePattern error:", r_Count, ",", patternElement)
            return True
    #satz = "Breath of the WildThe Legend of Zelda: Breath of the Wild auf der E3 angespielt."
    #['breath of th', 'e wildthe le', 'gend of zeld', 'a: breath of', ' the wild au', 'f der e3 ang', 'espielt.']
    t_List = [t_Text[i:i + n] for i in range(0, len(t_Text), n)]
    for t in t_List:
        t = t.strip()
        r_Count = t_Text.count(t)
        if r_Count > 1:
            #print("sentify.isNotDublicateString(): -> multiplePattern error:", r_Count, ",", t)
            return True
    return False
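# --- hedged example (not in the original script): demonstrates the duplicate
# --- check on the sample sentence from the comment above plus a clean sentence.
print(isDublicateString("Breath of the WildThe Legend of Zelda: Breath of the Wild auf der E3 angespielt."))  # True
print(isDublicateString("Ein ganz normaler Satz ohne Wiederholung."))  # False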
def isGoodCasing(text):
    """Heuristic check for broken casing, e.g. 'aA', '2P' or 'a2' transitions."""
    if not isinstance(text, str):
        return False
    strString = text  # str(text, 'utf-8')
    i = 0
    try:
        for c in strString:
            if c in [u'ß', u'ö', u'ä', u'ü', u'Ö', u'Ä', u'Ü']:
                pass
            elif c != " " and i + 1 < len(strString):
                if c.islower() and strString[i + 1].isupper():
                    # 'aA' error (otherwise too many false positives)
                    return False
                if c.isdigit() and (strString[i + 1].isupper() and strString[i + 1].isalpha()):
                    # '2P' error
                    return False
                elif c.isdigit() and strString[i + 1].isalpha():
                    # '2a' error -- disabled, too many false positives
                    # (careful: addresses in the output could be affected)
                    pass  # return False
                elif c.isdigit() and strString[i + 1].isdigit():
                    pass  # '11' is fine
                elif c.isalpha() and strString[i + 1].isdigit():
                    # 'a2' error
                    return False
                elif c.isdigit() and strString[i + 1] not in ['.', '!', '?', '"', '-', '\'', ':', ' ']:
                    # digit followed by an unexpected character
                    return False
            i = i + 1
    except Exception:
        pass
    return True


def strip_accents(text):
    return "".join(char for char in unicodedata.normalize('NFKD', text)
                   if unicodedata.category(char) != 'Mn')


def parse_html(page):
    """Clean HTML tags for webpages that aren't Gutenberg books."""
    try:
        # https://github.com/miso-belica/jusText/tree/dev/justext/stoplists
        parts = justext.justext(page, justext.get_stoplist('English'))
        #parts = justext.justext(page, justext.get_stoplist('German'))
        paragraphs = list()
        for part in parts:
            if not part.is_boilerplate:
                paragraphs.append(part.text)
        return str('\n\n'.join(paragraphs))
    except Exception:
        # jusText failed -- fall back to BeautifulSoup + bleach
        try:
            soup = BeautifulSoup(page, "html.parser")  # or "lxml"
            #comments = soup.findAll(text=lambda text: isinstance(text, Comment))  # remove comments
            #[comment.extract() for comment in comments]
            for script in soup.findAll(["script", "style", 'footer', 'head']):
                script.decompose()  # rip it out
            myText = soup.get_text()
            plaintextv1 = bleach.clean(myText, strip=True, strip_comments=True)
            plaintext = re.sub(r'<.*?>', '', plaintextv1)
            #plaintext = plaintext.replace('\n', ' ')
            #plaintext = textify.removeDomainsSimple(plaintext)
            return plaintext  # .strip()
        except (ValueError, KeyError, TypeError, Exception):
            #print("Unexpected error:", sys.exc_info()[0])
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            #print(exc_type, fname, exc_tb.tb_lineno)
            return str("")
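# --- hedged example (helper name and URL are illustrative, not from the
# --- original script): how parse_html() is typically combined with requests.
def fetch_and_clean(url):
    """Download a page and return its boilerplate-free text, '' on failure."""
    try:
        resp = requests.get(url, timeout=10, verify=False)
        return parse_html(resp.text)
    except requests.RequestException:
        return ""

#print(fetch_and_clean('https://hbr.org/')[:500])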
def encodeToASCII(text):
    encResults = text.encode('utf-8', "ignore")
    s_string = str(encResults.decode('ascii', "ignore"))
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToLatin1(text):
    #text = text.replace('ß', 'ss')
    encResults = text.encode('utf-8', "ignore")
    s_string = str(encResults.decode('latin-1', "ignore"))
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToUTF8Adv(text):
    encResults = text.encode('utf-8', "ignore")
    #return str(encResults.decode('latin-1', "ignore"))
    s_string = str(encResults.decode('utf-8', "ignore"))  # 'remove' is not a valid error handler
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToUTF8(text):
    return text.encode('utf-8', "ignore")


def transformText(text):
    # https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#stanfordcorenlplemmatization
    stops = set(stopwords.words("english"))
    # Convert text to lower case
    text = text.lower()
    # Remove non-ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Remove all the stopwords
    filtered_words = [word for word in text.split() if word not in stops]
    # Remove all tokens with fewer than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    # Preprocessed text after stop-word removal
    text = " ".join(filtered_words)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Strip all numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Stemming
    #return gensim.parsing.preprocessing.stem_text(text)
    # Lemmatize (gensim.utils.lemmatize requires gensim < 4.0 and the 'pattern' package)
    t = [wd.decode('utf-8').split('/')[0] for wd in gensim.utils.lemmatize(text)]
    return str(" ".join(t))


"""
MAIN
https://minimaxir.com/2019/09/howto-gpt2/
"""
for key in manager:
    print("Walking1: ", key)           # key
    print("Walking2: ", manager[key])  # value
    tmp_string = str("")
    mystorepath = storepathData + "/" + key
    if os.path.exists(mystorepath):
        folders = list()
        files = list()
        for entry in os.scandir(mystorepath):
            if entry.is_dir():
                folders.append(entry.path)
            elif entry.is_file():
                files.append(entry.path)
        for f in files:
            #if len(tmp_string) < 6500000:
            #print(f)
            with codecs.open(f, 'r', encoding='utf-8') as fr:
                data1 = fr.read().replace('\n', '').strip()
                #data1 = fr.readlines()
            # disabled cleanup pipeline:
            #data2 = beautifyCorpus(data1)
            #data2 = transformText(data1)
            #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
            #data3 = encodeToASCII(data3)
            #data3 = data3.replace('\n', '').strip().lower()
            with codecs.open(managerDB, 'a+', encoding='utf8') as fa:
                #fa.write("manager###" + tra + "\n")
                #fa.write(data3 + "\n")
                fa.write(data1 + "\n")

exit(1)
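# --- hedged sketch (not part of this script, which stops at exit(1) above):
# --- the Colab fine-tuning step mentioned in the notes and in the minimaxir
# --- howto is typically done with gpt-2-simple along these lines; the file
# --- name, step count and run name are illustrative assumptions.
#
# import gpt_2_simple as gpt2
# gpt2.download_gpt2(model_name="124M")
# gpt2.mount_gdrive()                                  # Colab only
# sess = gpt2.start_tf_sess()
# gpt2.finetune(sess,
#               dataset="manager_train.txt",
#               model_name="124M",
#               steps=1000,
#               run_name="manager")
# gpt2.copy_checkpoint_to_gdrive(run_name="manager")   # save results to Google Drive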