# -*- coding: utf-8 -*- #!/usr/bin/python3.6 -S -W ignore from sklearn.feature_extraction.text import TfidfVectorizer from sklearn import model_selection, naive_bayes, svm from sklearn.model_selection import train_test_split from sklearn.model_selection import cross_val_score from sklearn.model_selection import GridSearchCV from sklearn.metrics import confusion_matrix from sklearn.metrics import accuracy_score from sklearn.multiclass import OneVsRestClassifier from tabulate import tabulate from sklearn.svm import SVC from sklearn.svm import LinearSVC import logging import datetime import pprint import codecs import numpy as np import dill # wichtig für joblib load and dump import os import math import nltk # the natural langauage toolkit, open-source NLP import pandas as pd # pandas dataframe import re # regular expression from nltk.corpus import stopwords from gensim.utils import lemmatize from gensim import parsing # Help in preprocessing the data, very efficiently import gensim import numpy as np import pprint import lxml import codecs import os,sys import gzip import json import os import os import codecs import os,sys import gzip import json import re import subprocess from fasttext import train_supervised import fasttext import pprint pp = pprint.PrettyPrinter(indent=4) import spacy # See "Installing spaCy" nlp_de = spacy.load('de_core_news_sm') nlp_en = spacy.load('en_core_web_sm') nlp_de.max_length = 10000000 nlp_en.max_length = 10000000 def _is_wordlike(tok): return tok.orth_ and tok.orth_[0].isalpha() def sentence_division_suppresor(doc): """Spacy pipeline component that prohibits sentence segmentation between two tokens that start with a letter. Useful for taming overzealous sentence segmentation in German model, possibly others as well.""" for i, tok in enumerate(doc[:-1]): if _is_wordlike(tok) and _is_wordlike(doc[i + 1]): doc[i + 1].is_sent_start = False return doc nlp_de.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser') nlp_en.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser') def transformText(text): # https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#stanfordcorenlplemmatization stops = set(stopwords.words("english")) # Convert text to lower text = text.lower() # Removing non ASCII chars text = re.sub(r'[^\x00-\x7f]',r' ',text) # Strip multiple whitespaces text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text) # Removing all the stopwords filtered_words = [word for word in text.split() if word not in stops] # Removing all the tokens with lesser than 3 characters filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3) # Preprocessed text after stop words removal text = " ".join(filtered_words) # Remove the punctuation text = gensim.parsing.preprocessing.strip_punctuation2(text) # Strip all the numerics text = gensim.parsing.preprocessing.strip_numeric(text) # Strip multiple whitespaces text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text) # Stemming #return gensim.parsing.preprocessing.stem_text(text) # Lemmatize t = [wd.decode('utf-8').split('/')[0] for wd in gensim.utils.lemmatize(text)] return str(" ".join(t)) # cat cooking.stackexchange.txt | sed -e "s/([.\!?,'/()])/ 1 /g" | tr "[:upper:]" "[:lower:]" > cooking.preprocessed.txt storepathData = "/home/unaiqueFRAMEWORK/data" storepath = "/home/unaiqueFRAMEWORK/text_classify/fasttext/training_data" managerDB = storepath+"/manager_fasttext_train.txt" parentsDB = storepath+"/parents_fasttext_train.txt" teenagerDB = storepath+"/teenager_fasttext_train.txt" manager = {"bloomberg":'bloomberg.com', "cio":'cio.com', "hbr":'hbr.org', "forbes":'forbes.com', 'foreignaffairs':'foreignaffairs.com', 'ceo':'ceo.com','chiefexecutive':'chiefexecutive.net', 'economist':'economist.com', 'strategy-business':'strategy-business.com','managementexchange':'managementexchange.com', 'real-leaders':'real-leaders.com', 'inc':'inc.com', 'n2growth':'n2growth.com', "wsj" : 'wsj.com'} parents = {"babble" : 'babble.com', "alphamom":'alphamom.com'} teenager = {"teenink" : 'teenink.com', 'teenreads':'teenreads.com','girlscouts':'girlscouts.org','teenreads':'teenreads.com','teenkidsnews.com':'teenkidsnews','teensource':'teensource.org','teenvogue':'teenvogue.com','teenspeak':'teenspeak.org','teensgotcents':'teensgotcents.com','theteenagertoday':'theteenagertoday.com'} #https://gist.github.com/bittlingmayer/7139a6a75ba0dbbc3a06325394ae3a13 SUBEXES = ["s/’/'/g", "s/′/'/g", "s/''/ /g", "s/'/ ' /g", 's/“/"/g', 's/”/"/g', 's/"/ /g', "s/\\./ \\. /g", "s/
/ /g", "s/, / , /g", "s/(/ ( /g", "s/)/ ) /g", "s/\\!/ \\! /g", "s/\\?/ \\? /g", "s/\\;/ /g", "s/\\:/ /g", "s/-/ - /g", "s/=/ /g", "s/=/ /g", "s/*/ /g", "s/|/ /g", "s/«/ /g"] model = fasttext.load_model("/home/unaiqueFRAMEWORK/new_prototyp/fasttext/binary/unaique_fullTest2.bin") db_file = "/home/unaiqueFRAMEWORK/new_prototyp/trainingsdata/train/manager_ab" # Training on new Sample csv_file = "/home/unaiqueFRAMEWORK/new_prototyp/fasttext/testing_manager.csv" # Training on new Sample result_file = "/home/unaiqueFRAMEWORK/new_prototyp/fasttext/result_30_manager.txt" result_data_file = "/home/unaiqueFRAMEWORK/new_prototyp/fasttext/result_30_manager_data.txt" current = "manager" w=codecs.open(csv_file, 'w', encoding='utf-8') w.write("is_label;predict_labal;is_probability;is_entry\n") # Open one result for processing with codecs.open(db_file, 'r', encoding='ascii') as fr: #text = fr.read().replace('\n', '').strip().lower() text = fr.read().lower().split('\n') #text = fr.readline() fr.close() print("Read File complete! Entries:", len(text)) #exit(1) count = 0 for t in text: #print(t) docs = transformText(t) # für t_manager.txt auskommentiert #t = t.replace('\n', '').strip().lower() #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", t) #try: p = model.predict(docs, k=1, threshold=0.0) label = str(p[0][0]) probability = str(float(p[1].tolist()[0])) print("Entries:", len(text)) print("Predicted:",label) print("Probability:",probability) print("Current:",current) print("Entry:",count) print("##########################") #w.write("is_label;predict_labal;is_probability;is_entry") count = count + 1 w.write(current+";"+label+";"+probability+";"+str(count)+"\n") with codecs.open(result_file, 'a+', encoding='utf-8') as fa: fa.write(label+";"+str(probability)+"\n") fa.close() if label == "__label__TEENAGER": with codecs.open(result_data_file, 'a+', encoding='utf-8') as fa: fa.write(str(t)+"\n\n\n") fa.close() #except Exception as err2: # 1 w.close() exit(1) def __normalize_text(s): for subex in SUBEXES: s = subprocess.check_output(['sed', subex], input=s.encode("latin-1")).decode("utf-8") # s = subprocess.check_output(['sed', subex], input=s.encode("iso-8859-1")).decode("utf-8") return s def __spaces(s): return ' '.join(s.split()) def __digits(s): return ''.join(filter(lambda c: not c.isdigit(), s)) def preproc(s): return __digits(__spaces(s.lower())) def beautifyCorpus(text): rList = list() nlp_de.max_length = len(text) + 1 doc = nlp_de(text) for sent in doc.sents: s = str(sent) flag = isDublicateString(s) flag2 = isGoodCasing(s) if not flag and flag2: rList.append(s) return " ".join(rList) def isDublicateString(text): t_Text = text.lower() #debug = True patternElement = "" n = 16# neun zeichen müssen gleich sein if len(t_Text) >= n+3: patternElement = t_Text[:n] patternElement = patternElement.strip() #print("PatternElement:", patternElement) r_Count = t_Text.count(patternElement) if r_Count > 1: #if debug: # print("sentify.isNotDublicateString(): -> multiplePattern Error:", r_Count, ",", patternElement) return True #satz = "Breath of the WildThe Legend of Zelda: Breath of the Wild auf der E3 angespielt." #['breath of th', 'e wildthe le', 'gend of zeld', 'a: breath of', ' the wild au', 'f der e3 ang', 'espielt.'] t_List = [t_Text[i:i+n] for i in range(0, len(t_Text), n)] for t in t_List: t = t.strip() r_Count = t_Text.count(t) if r_Count > 1: #if debug: # print("sentify.isNotDublicateString(): -> multiplePattern Error:", r_Count, ",", t) return True return False def isGoodCasing(text): if not isinstance(text, str): return False strString = text #str(text,'utf-8') i=0 try: for c in strString: if c in [u'ß',u'ö',u'ä',u'ü',u'Ö',u'Ä',u'Ü']: 1 elif c != " " and len(strString) >=i+1: if c.islower() and strString[i+1].isupper(): # zuviel positive falses #print("sentify.isGoodCasing(): -> aA Error") #aA return False if c.isdigit() and (strString[i+1].isupper() and strString[i+1].isalpha()): #if debug: # print("sentify.isGoodCasing(): -> 2P Error") return False elif c.isdigit() and strString[i+1].isalpha(): # to much positive falses #print("sentify.isGoodCasing(): -> 2a Error") #2a -> Achtung, eventuell bei Adressen als Ausgabe gibt es Probleme 1# return False elif c.isdigit() and strString[i+1].isdigit(): 1 # 11 elif c.isalpha() and strString[i+1].isdigit(): #if debug: # print("sentify.isGoodCasing(): -> a2 Error") # a2 return False elif c.isdigit() and strString[i+1] not in ['.','!','?','"','-','\'', ':',' ']: #if debug: # print("sentify.isGoodCasing -> text.lower()/upper() Error:") return False i=i+1 except: 1 return True def strip_accents(text): return "".join(char for char in unicodedata.normalize('NFKD', text) if unicodedata.category(char) != 'Mn') def parse_html(page): """ Clean HTML tags for webpages that aren't Gutenberg books """ try: # https://github.com/miso-belica/jusText/tree/dev/justext/stoplists parts = justext.justext(page, justext.get_stoplist('English')) #parts = justext.justext(page, justext.get_stoplist('German')) #print(parts) paragraphs = list() for part in parts: #pp.pprint(part.is_boilerplate) if not part.is_boilerplate: #pp.pprint(part) paragraphs.append(part.text) s=str('\n\n'.join(paragraphs)) if len(s) > 50: return s else: soup = BeautifulSoup(page, "html.parser") #"html.parser" "lxml") #comments = soup.findAll(text=lambda text:isinstance(text, Comment)) # remove comments #[comment.extract() for comment in comments] for script in soup.findAll(["script", "style", 'footer', 'head']): #script.extract() # rip it out script.decompose() # rip it out #[x.extract() for x in soup.findAll(['script', 'style'])] #[x.decompose() for x in soup.findAll(['script', 'style'])] soup.prettify() myText = soup.get_text() plaintextv1 = bleach.clean(myText, strip=True, strip_comments=True) plaintext = re.sub(r'<.*?>', '', plaintextv1) #plaintext = plaintext.replace('\n', ' ') #plaintext = plaintext.replace("\n", ' ') #plaintext = textify.removeDomainsSimple(plaintext) return plaintext#.strip() except Exception as e: #print("Unexpected error:", sys.exc_info()[0]) exc_type, exc_obj, exc_tb = sys.exc_info() fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] #print(exc_type, fname, exc_tb.tb_lineno) return str("") def encodeToASCII(text): encResults = text.encode('utf-8', "ignore") s_string = str(encResults.decode('ascii', "ignore")) #textv1 = re_pattern.sub(u'\uFFFD', s_string) return s_string def encodeToUTF8Adv(text): encResults = text.encode('utf-8', "ignore") #return str(encResults.decode('latin-1', "ignore")) s_string = str(encResults.decode('utf-8', "remove")) #textv1 = re_pattern.sub(u'\uFFFD', s_string) return s_string def encodeToLatin1(text): #text = text.replace('ß','ss') encResults = text.encode('utf-8', "ignore") #encResults = text.encode('utf-8', "ignore") s_string = str(encResults.decode('latin-1', "ignore")) #textv1 = re_pattern.sub(u'\uFFFD', s_string) return s_string """ MAIN """ """ for key in manager: #print (key) # key #print (manager[key]) # value mystorepath = storepathData+"/"+key if os.path.exists(mystorepath): folders = list() files = list() for entry in os.scandir(mystorepath): if entry.is_dir(): folders.append(entry.path) elif entry.is_file(): files.append(entry.path) for f in files: #print(f) with codecs.open(f, 'r', encoding='utf-8') as fr: data3 = fr.read().replace('\n', '').strip() fr.close() #tra = transformText(h) #data2 = beautifyCorpus(data1) #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2) data3 = encodeToASCII(data3) #data3 = preproc(data3) if len(data3) > 50: with codecs.open(managerDB, 'a+', encoding='ascii') as fa: #fa.write("manager###"+tra+"\n") fa.write("__label__MANAGER"+" "+data3+"\n") fa.close() for key in teenager: #print (key) # key #print (manager[key]) # value mystorepath = storepathData+"/"+key if os.path.exists(mystorepath): folders = list() files = list() for entry in os.scandir(mystorepath): if entry.is_dir(): folders.append(entry.path) elif entry.is_file(): files.append(entry.path) for f in files: #print(f) with codecs.open(f, 'r', encoding='utf-8') as fr: data3 = fr.read().replace('\n', '').strip() fr.close() #tra = transformText(h) #data2 = beautifyCorpus(data1) #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2) data3 = encodeToASCII(data3) #data3 = preproc(data3) if len(data3) > 50: with codecs.open(teenagerDB, 'a+', encoding='ascii') as fa: #fa.write("teenager###"+tra+"\n") fa.write("__label__TEENAGER"+" "+data3+"\n") fa.close() for key in parents: #print (key) # key #print (manager[key]) # value mystorepath = storepathData+"/"+key if os.path.exists(mystorepath): folders = list() files = list() for entry in os.scandir(mystorepath): if entry.is_dir(): folders.append(entry.path) elif entry.is_file(): files.append(entry.path) for f in files: #print(f) with codecs.open(f, 'r', encoding='utf-8') as fr: data3 = fr.read().replace('\n', '').strip() fr.close() #tra = transformText(h) #data2 = beautifyCorpus(data1) #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2) data3 = encodeToASCII(data3) #data3 = preproc(data3) if len(data3) > 50: with codecs.open(parentsDB, 'a+', encoding='ascii') as fa: #fa.write("parents###"+tra+"\n") fa.write("__label__PARENTS"+" "+data3+"\n") fa.close() folders = list() files = list() for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/manager/"): if entry.is_dir(): folders.append(entry.path) elif entry.is_file(): files.append(entry.path) for f in files: if f.find("en")!= -1: with codecs.open(f, 'r', encoding='utf-8') as fr: data3 = fr.read().replace('\n', '').strip() fr.close() data3 = parse_html(data3) ###tra = transformText(h) #data2 = beautifyCorpus(h) #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2) data3 = encodeToASCII(data3) #data3 = preproc(data3) if len(data3) > 50: with codecs.open(managerDB, 'a+', encoding='ascii') as fa: #fa.write("manager###"+tra+"\n") fa.write("__label__MANAGER"+" "+data3+"\n") fa.close() folders = list() files = list() for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/parents/"): if entry.is_dir(): folders.append(entry.path) elif entry.is_file(): files.append(entry.path) for f in files: if f.find("en")!= -1: with codecs.open(f, 'r', encoding='utf-8') as fr: data3 = fr.read().replace('\n', '').strip() fr.close() data3 = parse_html(data3) ###tra = transformText(h) #data2 = beautifyCorpus(h) #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2) data3 = encodeToASCII(data3) #data3 = preproc(data3) if len(data3) > 50: with codecs.open(parentsDB, 'a+', encoding='ascii') as fa: #fa.write("parents###"+tra+"\n") fa.write("__label__PARENTS"+" "+data3+"\n") fa.close() folders = list() files = list() for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/teenager/"): if entry.is_dir(): folders.append(entry.path) elif entry.is_file(): files.append(entry.path) for f in files: if f.find("en")!= -1: with codecs.open(f, 'r', encoding='utf-8') as fr: data3 = fr.read().replace('\n', '').strip() fr.close() data3 = parse_html(data3) ###tra = transformText(h) #data2 = beautifyCorpus(h) #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2) data3 = encodeToASCII(data3) #data3 = preproc(data3) if len(data3) > 50: with codecs.open(teenagerDB, 'a+', encoding='ascii') as fa: #fa.write("teenager###"+tra+"\n") fa.write("__label__TEENAGER"+" "+data3+"\n") fa.close() """ """ exit(1) """ """ print( p[0][0] ) print( type(p[0][0]) ) print() print() print( p[1].tolist() ) print( type(p[1].tolist()) ) """ exit(1) #fasttext.load_model(self.fasttext_model_file) #model.predict("Which baking dish is best to bake a banana bread ?", k=-1, threshold=0.5) #model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000) #model.save_model("/home/unaiqueFRAMEWORK/text_classify/fasttext/binary/unaique.ftz")