from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from tabulate import tabulate
from nltk.corpus import stopwords
from gensim.utils import lemmatize
from gensim import parsing  # helps in preprocessing the data very efficiently
import gensim
import nltk  # the natural language toolkit, open-source NLP
import pandas as pd  # pandas dataframe
import numpy as np
import re  # regular expressions
import dill  # needed for dumping/loading the models (joblib-style persistence)
import logging
import datetime
import pprint
import codecs
import lxml
import gzip
import json
import math
import os
import sys

# root@v22020089423124746:/home/seo-auto-scaler/data# python3 extractor_dreizweieins.py && python3 extractor_ecommerce-vision.py && python3 extractor_seokratie.py && python3 extractor_seonative.py && python3 extractor_seotrainee.py

pp = pprint.PrettyPrinter(indent=4)
logging.basicConfig(level=logging.INFO)
np.random.seed(500)

a = datetime.datetime.now()

# Paths for the pickled models
svc_obj = "/home/seo-auto-scaler/svm/svc.dill"
tfidf_obj = "/home/seo-auto-scaler/svm/tfidf.dill"
#svc_obj = "/dev/shm/binarys/svc_large1.dill"
#tfidf_obj = "/dev/shm/binarys/tfidf_large1.dill"

# Read the training file
# print(svm.decision_function(X_test))  Prediction is Label: [['manager' 'parents' 'teenager']]
#db_file = "/home/unaiqueFRAMEWORK/live_datasets/testset_unaique3.txt"
#db_file = "/home/unaiqueFRAMEWORK/data/theteenagertoday/155.txt"  # Teenager: training on new sample
#db_file = "/home/unaiqueFRAMEWORK/data/ceo/155.txt"  # training on new sample
#db_file = "/home/unaiqueFRAMEWORK/data/babble/155.txt"  # training on new sample
db_file = "/home/seo-auto-scaler/svm/data.txt"

load_model = False

print("Processing file:", db_file)
print("Loading Model:", load_model)

#print("Loading pickled TfidfVectorizer!")
#with open(tfidf_obj, 'rb') as pf:
#    tf_transformer = dill.load(pf)


def transformText(text):
    """Normalise a raw text: lowercase, strip non-ASCII characters, stop words,
    short tokens, punctuation and numbers, then lemmatize."""
    # https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#stanfordcorenlplemmatization
    stops = set(stopwords.words("german"))
    # Convert text to lower case
    text = text.lower()
    # Remove non-ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Remove all stop words
    filtered_words = [word for word in text.split() if word not in stops]
    # Remove all tokens with fewer than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    # Preprocessed text after stop-word removal
    text = " ".join(filtered_words)
    # Remove punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Strip all numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    # Strip multiple whitespaces again
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Stemming
    #return gensim.parsing.preprocessing.stem_text(text)
    # Lemmatize
    t = [wd.decode('utf-8').split('/')[0] for wd in gensim.utils.lemmatize(text)]
    return str(" ".join(t))


def filter_doc_list_through_topics_single(topics, docs):  # string, list
    """Wrap a single (topic, text) pair into a one-element list of two-tuples."""
    ref_docs = list()
    d_tup = (topics, docs)
    ref_docs.append(d_tup)
    return ref_docs


def filter_doc_list_through_topics(docs1):  # list of strings
    """
    Reads all of the documents and creates a new list of two-tuples,
    each containing a single class label and the body text. The
    fastText-style "__label__*" tags found in each line are mapped to
    the SEO_* class names and stripped from the text.
    """
    #docs = docs1.split("\n")
    #print(len(docs1))
    ref_docs = list()
    for d in docs1:
        #print("filter_doc_list_through_topics", d)
        if not d.strip():
            # skip empty lines (e.g. the trailing line produced by split('\n'))
            continue
        if "__label__VISION" in d:
            t = d.replace("__label__VISION", "")
            l = "SEO_VISION"
        if "__label__TRAINEE" in d:
            t = d.replace("__label__TRAINEE", "")
            l = "SEO_TRAINEE"
        if "__label__KRATIE" in d:
            t = d.replace("__label__KRATIE", "")
            l = "SEO_KRATIE"
        if "__label__321" in d:
            t = d.replace("__label__321", "")
            l = "SEO_321"
        if "__label__NATIVE" in d:
            t = d.replace("__label__NATIVE", "")
            l = "SEO_NATIVE"
        #try:
        label = l
        text = t.lower()
        #label, text = d.split("__label__TEENAGER")
        d_tup = (label, text)
        ref_docs.append(d_tup)
        #except Exception as e:
        #    pass
    return ref_docs


def create_tfidf_training_data(docs):
    """
    Creates a document corpus list (by stripping out the class labels),
    then applies the TF-IDF transform to this list.
    The function returns both the class label vector (y) and
    the corpus token/feature matrix (X).
    """
    # Create the training data class labels
    y = [d[0] for d in docs]
    # Create the document corpus list
    corpus = [d[1] for d in docs]
    # Create the TF-IDF vectoriser and transform the corpus
    # https://stackoverflow.com/questions/52150800/valueerror-x-has-1709-features-per-sample-expecting-2444
    #vectorizer = TfidfVectorizer(min_df=1, use_idf=True, smooth_idf=True, analyzer='word', ngram_range=(1,2), stop_words="english", lowercase=True)
    vectorizer = TfidfVectorizer(min_df=1, use_idf=True, smooth_idf=True)
    tf_transformer = vectorizer.fit(corpus)
    X = vectorizer.fit_transform(corpus)  # copy=True when load_model is True, y=None when load_model is False
    #vectorizer = TfidfVectorizer()
    #X = vectorizer.transform(corpus, copy=True)
    #print(len(vectorizer.vocabulary_))
    if not load_model:
        #os.unlink(svc_obj)
        print("Pickling TfidfVectorizer!")
        with open(tfidf_obj, 'wb') as pf:
            dill.dump(tf_transformer, pf)
    return X, y


def create_tfidf_training_data_load(docs):
    """
    Same as create_tfidf_training_data(), but rebuilds the vectoriser on the
    vocabulary of the pickled TfidfVectorizer so the feature space matches
    the one used at training time. Returns the class label vector (y) and
    the corpus token/feature matrix (X).
""" corpus = list() y = list() # Create the training data class labels y = [d[0] for d in docs] #print(y) #print(len(y)) #print(type(y)) # Create the document corpus list corpus = [d[1] for d in docs] #print(corpus) #print(len(corpus)) #print(type(corpus)) # Create the TF-IDF vectoriser and transform the corpus # ######## https://stackoverflow.com/questions/52150800/valueerror-x-has-1709-features-per-sample-expecting-2444 """ HACK für cross_val_score(): START ay = list() acorpus = list() for a in range(0,2): ay.append(y) acorpus.append(corpus) y = list() corpus = list() y = ay corpus = acorpus HACK für cross_val_score(): ENDE """ """ print("Loading pickled TfidfVectorizer!") with open(tfidf_obj,'rb') as pf: tf_transformer = dill.load(pf) """ vectorizer = TfidfVectorizer(min_df=1, use_idf=True, smooth_idf=True, vocabulary=tf_transformer.vocabulary_) #vectorizer = TfidfVectorizer(min_df=1, use_idf=True, smooth_idf=True, analyzer='word', ngram_range=(1,2), stop_words="english", lowercase=True, vocabulary=tf_transformer.vocabulary_) X = vectorizer.fit_transform(corpus) # copy=True bei load_model = True y=None bei load_model False return X, y def train_svm(X, y): """ Create and train the Support Vector Machine. #svm = SVC(C=1000000.0, gamma='auto', cache_size=512, kernel='rbf', probability=True) # https://stackoverflow.com/questions/31681373/making-svm-run-faster-in-python """ """ svm = SVC() #OneVsRestClassifier(SVC()) 'n_jobs':[-1] ‘sigmoid parameters = {'kernel':('linear', 'rbf', 'poly', 'sigmoid'), 'C':(1,0.25,0.5,0.75),'gamma': (1,2,3,'auto'),'decision_function_shape':('ovo','ovr'),'shrinking':(True,False)} #parameters = {'kernel': ('linear','poly'), 'cache_size':(256, 512), 'C':(0.5, 0.75), 'gamma': (3,'auto'),'decision_function_shape': ('ovo','ovr'), 'shrinking': (True,False)} clf = GridSearchCV(svm, parameters) clf.fit(X, y) return clf """ svm = SVC(C=1.0, gamma='scale', cache_size=512, kernel='poly', degree=3, verbose=3, max_iter=1000, class_weight='balanced', probability=True) #svm = LinearSVC(verbose=0, C=1.0, max_iter=3000, class_weight='balanced') svm.fit(X, y) if not load_model: #os.unlink(svc_obj) print() print("Pickling LinearSVC!") with open(svc_obj,'wb') as pf: dill.dump(svm, pf) pf.close() #joblib.dump(model, filename) return svm if __name__ == "__main__": #labels: manager, teenager, parents """ Die beiden in if loadmodel versionen mal gegeneinander parellel laufen lassen """ # Open one result for processing with codecs.open(db_file, 'r', encoding='utf-8') as fr: #text = fr.read().replace('\n', '').strip() #text = fr.read() text = fr.read().split('\n') fr.close() print("Read File complete!") #docs = transformText(text) # für t_manager.txt auskommentiert docs = text print("File Preprocessing Sample complete!") # Obtain the topic tags and filter docs through it ref_docs = filter_doc_list_through_topics(docs) print("Filter File complete!",len(ref_docs)) # Vectorise and TF-IDF transform the corpus X, y = create_tfidf_training_data(ref_docs) print("Tf-IDF File complete!") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True) #X_train, X_test, y_train, y_test = X, X, y, y print("Train_Test File complete!") # Create and train the Support Vector Machine svm = train_svm(X_train, y_train) print("SVC File complete!") """ with open(svc_obj,'rb') as pf: svm = dill.load(pf) pf.close() print("SVC File complete!") """ """ # Make an array of predictions on the test set pred_label = svm.predict(X_test) pred_label_str = str("\n".join(pred_label)) 
#print("Prediction:", len(pred_label)) print("Prediction is Label:", pred_label_str) # Output the hit-rate and the confusion matrix for each model print(svm.score(X_test, y_test)) """ """ dfunct_val = float(0) dfunct = svm.decision_function(X_test).tolist()[0] # numpy array vernested eine liste in eine weitere python liste drum nehme das erste element prob_list = list() for d in dfunct: sample_probability = 1 / (1 + math.exp(0 - d)) prob_list.append(sample_probability) #print("Sample probability score:"+str(sample_probability)) prob_list.sort() # hole das höhste zahlenelement und gibt es via format() aus print("Sample probability: {0}".format(prob_list[-1])) """ b = datetime.datetime.now() delta = b - a print("Process Runtime:", delta)