from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from tabulate import tabulate
from nltk.corpus import stopwords
from gensim.utils import lemmatize
from gensim import parsing  # helps in preprocessing the data very efficiently
import gensim
import nltk  # the natural language toolkit, open-source NLP
import pandas as pd  # pandas dataframe
import numpy as np
import re  # regular expressions
import dill  # needed for dumping/loading the models (joblib-style persistence)
import logging
import datetime
import pprint
import codecs
import lxml
import gzip
import json
import math
import os
import sys

# root@v22020089423124746:/home/seo-auto-scaler/data# python3 extractor_dreizweieins.py && python3 extractor_ecommerce-vision.py && python3 extractor_seokratie.py && python3 extractor_seonative.py && python3 extractor_seotrainee.py

pp = pprint.PrettyPrinter(indent=4)
logging.basicConfig(level=logging.INFO)
np.random.seed(500)

a = datetime.datetime.now()

# Paths for the pickled models
svc_obj = "/home/seo-auto-scaler/svm/svc.dill"
tfidf_obj = "/home/seo-auto-scaler/svm/tfidf.dill"
#svc_obj = "/dev/shm/binarys/svc_large1.dill"
#tfidf_obj = "/dev/shm/binarys/tfidf_large1.dill"

# Read the training file
# print(svm.decision_function(X_test))  Prediction is Label: [['manager' 'parents' 'teenager']]
#db_file = "/home/unaiqueFRAMEWORK/live_datasets/testset_unaique3.txt"
#db_file = "/home/unaiqueFRAMEWORK/data/theteenagertoday/155.txt"  # Teenager: training on new sample
#db_file = "/home/unaiqueFRAMEWORK/data/ceo/155.txt"  # training on new sample
#db_file = "/home/unaiqueFRAMEWORK/data/babble/155.txt"  # training on new sample
db_file = "/home/seo-auto-scaler/svm/data.txt"

load_model = False

print("Processing file:", db_file)
print("Loading Model:", load_model)

#print("Loading pickled TfidfVectorizer!")
#with open(tfidf_obj, 'rb') as pf:
#    tf_transformer = dill.load(pf)


def transformText(text):
    """Normalise a raw text: lowercase, strip non-ASCII characters, stop words,
    short tokens, punctuation and numbers, then lemmatize."""
    # https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#stanfordcorenlplemmatization
    stops = set(stopwords.words("german"))
    # Convert text to lower case
    text = text.lower()
    # Remove non-ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Remove all stop words
    filtered_words = [word for word in text.split() if word not in stops]
    # Remove all tokens with fewer than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    # Preprocessed text after stop-word removal
    text = " ".join(filtered_words)
    # Remove punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Strip all numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    # Strip multiple whitespaces again
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Stemming
    #return gensim.parsing.preprocessing.stem_text(text)
    # Lemmatize
    t = [wd.decode('utf-8').split('/')[0] for wd in gensim.utils.lemmatize(text)]
    return str(" ".join(t))


def filter_doc_list_through_topics_single(topics, docs):  # string, list
    """Wrap a single (topic, text) pair into a one-element list of two-tuples."""
    ref_docs = list()
    d_tup = (topics, docs)
    ref_docs.append(d_tup)
    return ref_docs


def filter_doc_list_through_topics(docs1):  # list of strings
    """
    Reads all of the documents and creates a new list of two-tuples,
    each containing a single class label and the body text. The
    fastText-style "__label__*" tags found in each line are mapped to
    the SEO_* class names and stripped from the text.
    """
    #docs = docs1.split("\n")
    #print(len(docs1))
    ref_docs = list()
    for d in docs1:
        #print("filter_doc_list_through_topics", d)
        if not d.strip():
            # skip empty lines (e.g. the trailing line produced by split('\n'))
            continue
        if "__label__VISION" in d:
            t = d.replace("__label__VISION", "")
            l = "SEO_VISION"
        if "__label__TRAINEE" in d:
            t = d.replace("__label__TRAINEE", "")
            l = "SEO_TRAINEE"
        if "__label__KRATIE" in d:
            t = d.replace("__label__KRATIE", "")
            l = "SEO_KRATIE"
        if "__label__321" in d:
            t = d.replace("__label__321", "")
            l = "SEO_321"
        if "__label__NATIVE" in d:
            t = d.replace("__label__NATIVE", "")
            l = "SEO_NATIVE"
        #try:
        label = l
        text = t.lower()
        #label, text = d.split("__label__TEENAGER")
        d_tup = (label, text)
        ref_docs.append(d_tup)
        #except Exception as e:
        #    pass
    return ref_docs


def create_tfidf_training_data(docs):
    """
    Creates a document corpus list (by stripping out the class labels),
    then applies the TF-IDF transform to this list.
    The function returns both the class label vector (y) and
    the corpus token/feature matrix (X).
    """
    # Create the training data class labels
    y = [d[0] for d in docs]
    # Create the document corpus list
    corpus = [d[1] for d in docs]
    # Create the TF-IDF vectoriser and transform the corpus
    # https://stackoverflow.com/questions/52150800/valueerror-x-has-1709-features-per-sample-expecting-2444
    #vectorizer = TfidfVectorizer(min_df=1, use_idf=True, smooth_idf=True, analyzer='word', ngram_range=(1,2), stop_words="english", lowercase=True)
    vectorizer = TfidfVectorizer(min_df=1, use_idf=True, smooth_idf=True)
    tf_transformer = vectorizer.fit(corpus)
    X = vectorizer.fit_transform(corpus)  # copy=True when load_model is True, y=None when load_model is False
    #vectorizer = TfidfVectorizer()
    #X = vectorizer.transform(corpus, copy=True)
    #print(len(vectorizer.vocabulary_))
    if not load_model:
        #os.unlink(svc_obj)
        print("Pickling TfidfVectorizer!")
        with open(tfidf_obj, 'wb') as pf:
            dill.dump(tf_transformer, pf)
    return X, y


def create_tfidf_training_data_load(docs):
    """
    Same as create_tfidf_training_data(), but rebuilds the vectoriser on the
    vocabulary of the pickled TfidfVectorizer so the feature space matches
    the one used at training time. Returns the class label vector (y) and
    the corpus token/feature matrix (X).
""" corpus = list() y = list() # Create the training data class labels y = [d[0] for d in docs] #print(y) #print(len(y)) #print(type(y)) # Create the document corpus list corpus = [d[1] for d in docs] #print(corpus) #print(len(corpus)) #print(type(corpus)) # Create the TF-IDF vectoriser and transform the corpus # ######## https://stackoverflow.com/questions/52150800/valueerror-x-has-1709-features-per-sample-expecting-2444 """ HACK für cross_val_score(): START ay = list() acorpus = list() for a in range(0,2): ay.append(y) acorpus.append(corpus) y = list() corpus = list() y = ay corpus = acorpus HACK für cross_val_score(): ENDE """ """ print("Loading pickled TfidfVectorizer!") with open(tfidf_obj,'rb') as pf: tf_transformer = dill.load(pf) """ vectorizer = TfidfVectorizer(min_df=1, use_idf=True, smooth_idf=True, vocabulary=tf_transformer.vocabulary_) #vectorizer = TfidfVectorizer(min_df=1, use_idf=True, smooth_idf=True, analyzer='word', ngram_range=(1,2), stop_words="english", lowercase=True, vocabulary=tf_transformer.vocabulary_) X = vectorizer.fit_transform(corpus) # copy=True bei load_model = True y=None bei load_model False return X, y def train_svm(X, y): """ Create and train the Support Vector Machine. #svm = SVC(C=1000000.0, gamma='auto', cache_size=512, kernel='rbf', probability=True) # https://stackoverflow.com/questions/31681373/making-svm-run-faster-in-python """ """ svm = SVC() #OneVsRestClassifier(SVC()) 'n_jobs':[-1] ‘sigmoid parameters = {'kernel':('linear', 'rbf', 'poly', 'sigmoid'), 'C':(1,0.25,0.5,0.75),'gamma': (1,2,3,'auto'),'decision_function_shape':('ovo','ovr'),'shrinking':(True,False)} #parameters = {'kernel': ('linear','poly'), 'cache_size':(256, 512), 'C':(0.5, 0.75), 'gamma': (3,'auto'),'decision_function_shape': ('ovo','ovr'), 'shrinking': (True,False)} clf = GridSearchCV(svm, parameters) clf.fit(X, y) return clf """ svm = SVC(C=1.0, gamma='scale', cache_size=512, kernel='poly', degree=3, verbose=3, max_iter=1000, class_weight='balanced', probability=True) #svm = LinearSVC(verbose=0, C=1.0, max_iter=3000, class_weight='balanced') svm.fit(X, y) if not load_model: #os.unlink(svc_obj) print() print("Pickling LinearSVC!") with open(svc_obj,'wb') as pf: dill.dump(svm, pf) pf.close() #joblib.dump(model, filename) return svm if __name__ == "__main__": #labels: manager, teenager, parents """ Die beiden in if loadmodel versionen mal gegeneinander parellel laufen lassen """ # Open one result for processing with codecs.open(db_file, 'r', encoding='utf-8') as fr: #text = fr.read().replace('\n', '').strip() #text = fr.read() text = fr.read().split('\n') fr.close() print("Read File complete!") #docs = transformText(text) # für t_manager.txt auskommentiert docs = text print("File Preprocessing Sample complete!") # Obtain the topic tags and filter docs through it ref_docs = filter_doc_list_through_topics(docs) print("Filter File complete!",len(ref_docs)) # Vectorise and TF-IDF transform the corpus X, y = create_tfidf_training_data(ref_docs) print("Tf-IDF File complete!") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True) #X_train, X_test, y_train, y_test = X, X, y, y print("Train_Test File complete!") # Create and train the Support Vector Machine svm = train_svm(X_train, y_train) print("SVC File complete!") """ with open(svc_obj,'rb') as pf: svm = dill.load(pf) pf.close() print("SVC File complete!") """ """ # Make an array of predictions on the test set pred_label = svm.predict(X_test) pred_label_str = str("\n".join(pred_label)) 
#print("Prediction:", len(pred_label)) print("Prediction is Label:", pred_label_str) # Output the hit-rate and the confusion matrix for each model print(svm.score(X_test, y_test)) """ """ dfunct_val = float(0) dfunct = svm.decision_function(X_test).tolist()[0] # numpy array vernested eine liste in eine weitere python liste drum nehme das erste element prob_list = list() for d in dfunct: sample_probability = 1 / (1 + math.exp(0 - d)) prob_list.append(sample_probability) #print("Sample probability score:"+str(sample_probability)) prob_list.sort() # hole das höhste zahlenelement und gibt es via format() aus print("Sample probability: {0}".format(prob_list[-1])) """ b = datetime.datetime.now() delta = b - a print("Process Runtime:", delta)