# -*- coding: utf-8 -*- #!/usr/bin/python3.6 -S -W ignore import os import codecs import os,sys import gzip import json import re import subprocess from fasttext import train_supervised import fasttext import pprint pp = pprint.PrettyPrinter(indent=4) import spacy # See "Installing spaCy" nlp_de = spacy.load('de_core_news_sm') nlp_en = spacy.load('en_core_web_sm') nlp_de.max_length = 10000000 nlp_en.max_length = 10000000 def _is_wordlike(tok): return tok.orth_ and tok.orth_[0].isalpha() def sentence_division_suppresor(doc): """Spacy pipeline component that prohibits sentence segmentation between two tokens that start with a letter. Useful for taming overzealous sentence segmentation in German model, possibly others as well.""" for i, tok in enumerate(doc[:-1]): if _is_wordlike(tok) and _is_wordlike(doc[i + 1]): doc[i + 1].is_sent_start = False return doc nlp_de.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser') nlp_en.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser') # cat cooking.stackexchange.txt | sed -e "s/([.\!?,'/()])/ 1 /g" | tr "[:upper:]" "[:lower:]" > cooking.preprocessed.txt storepathData = "/home/unaiqueFRAMEWORK/data" storepath = "/home/unaiqueFRAMEWORK/text_classify/fasttext/training_data" managerDB = storepath+"/manager_fasttext_train.txt" parentsDB = storepath+"/parents_fasttext_train.txt" teenagerDB = storepath+"/teenager_fasttext_train.txt" manager = {"bloomberg":'bloomberg.com', "cio":'cio.com', "hbr":'hbr.org', "forbes":'forbes.com', 'foreignaffairs':'foreignaffairs.com', 'ceo':'ceo.com','chiefexecutive':'chiefexecutive.net', 'economist':'economist.com', 'strategy-business':'strategy-business.com','managementexchange':'managementexchange.com', 'real-leaders':'real-leaders.com', 'inc':'inc.com', 'n2growth':'n2growth.com', "wsj" : 'wsj.com'} parents = {"babble" : 'babble.com', "alphamom":'alphamom.com'} teenager = {"teenink" : 'teenink.com', 'teenreads':'teenreads.com','girlscouts':'girlscouts.org','teenreads':'teenreads.com','teenkidsnews.com':'teenkidsnews','teensource':'teensource.org','teenvogue':'teenvogue.com','teenspeak':'teenspeak.org','teensgotcents':'teensgotcents.com','theteenagertoday':'theteenagertoday.com'} #https://gist.github.com/bittlingmayer/7139a6a75ba0dbbc3a06325394ae3a13 SUBEXES = ["s/’/'/g", "s/′/'/g", "s/''/ /g", "s/'/ ' /g", 's/“/"/g', 's/”/"/g', 's/"/ /g', "s/\\./ \\. /g", "s/
/ /g", "s/, / , /g", "s/(/ ( /g", "s/)/ ) /g", "s/\\!/ \\! /g", "s/\\?/ \\? /g", "s/\\;/ /g", "s/\\:/ /g", "s/-/ - /g", "s/=/ /g", "s/=/ /g", "s/*/ /g", "s/|/ /g", "s/«/ /g"] model = fasttext.load_model("/home/unaiqueFRAMEWORK/new_prototyp/fasttext/binary/unaique_full.bin") db_file = "/home/unaiqueFRAMEWORK/new_prototyp/fasttext/trainingsdata/train/manager_aa" # Training on new Sample csv_file = "/home/unaiqueFRAMEWORK/new_prototyp/fasttext/testing_manager.csv" # Training on new Sample current = "manager" w=codecs.open(csv_file, 'w', encoding='utf-8') w.write("is_label;predict_labal;is_probability;is_entry\n") # Open one result for processing with codecs.open(db_file, 'r', encoding='ascii') as fr: #text = fr.read().replace('\n', '').strip().lower() text = fr.readline() fr.close() print("Read File complete! Entries:", len(text)) #exit(1) count = 0 for t in text: #t = t.replace('\n', '').strip().lower() data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", t) try: p = model.predict(data3, k=1, threshold=0.0) label = str(p[0][0]) probability = str(float(p[1].tolist()[0])) print("Entries:", len(text)) print("Predicted:",label) print("Probability:",probability) print("Current:",current) print("Entry:",count) print("##########################") #w.write("is_label;predict_labal;is_probability;is_entry") count = count + 1 w.write(current+";"+label+";"+probability+";"+str(count)+"\n") except Exception as err2: 1 w.close() exit(1) def __normalize_text(s): for subex in SUBEXES: s = subprocess.check_output(['sed', subex], input=s.encode("latin-1")).decode("utf-8") # s = subprocess.check_output(['sed', subex], input=s.encode("iso-8859-1")).decode("utf-8") return s def __spaces(s): return ' '.join(s.split()) def __digits(s): return ''.join(filter(lambda c: not c.isdigit(), s)) def preproc(s): return __digits(__spaces(s.lower())) def beautifyCorpus(text): rList = list() nlp_de.max_length = len(text) + 1 doc = nlp_de(text) for sent in doc.sents: s = str(sent) flag = isDublicateString(s) flag2 = isGoodCasing(s) if not flag and flag2: rList.append(s) return " ".join(rList) def isDublicateString(text): t_Text = text.lower() #debug = True patternElement = "" n = 16# neun zeichen müssen gleich sein if len(t_Text) >= n+3: patternElement = t_Text[:n] patternElement = patternElement.strip() #print("PatternElement:", patternElement) r_Count = t_Text.count(patternElement) if r_Count > 1: #if debug: # print("sentify.isNotDublicateString(): -> multiplePattern Error:", r_Count, ",", patternElement) return True #satz = "Breath of the WildThe Legend of Zelda: Breath of the Wild auf der E3 angespielt." #['breath of th', 'e wildthe le', 'gend of zeld', 'a: breath of', ' the wild au', 'f der e3 ang', 'espielt.'] t_List = [t_Text[i:i+n] for i in range(0, len(t_Text), n)] for t in t_List: t = t.strip() r_Count = t_Text.count(t) if r_Count > 1: #if debug: # print("sentify.isNotDublicateString(): -> multiplePattern Error:", r_Count, ",", t) return True return False def isGoodCasing(text): if not isinstance(text, str): return False strString = text #str(text,'utf-8') i=0 try: for c in strString: if c in [u'ß',u'ö',u'ä',u'ü',u'Ö',u'Ä',u'Ü']: 1 elif c != " " and len(strString) >=i+1: if c.islower() and strString[i+1].isupper(): # zuviel positive falses #print("sentify.isGoodCasing(): -> aA Error") #aA return False if c.isdigit() and (strString[i+1].isupper() and strString[i+1].isalpha()): #if debug: # print("sentify.isGoodCasing(): -> 2P Error") return False elif c.isdigit() and strString[i+1].isalpha(): # to much positive falses #print("sentify.isGoodCasing(): -> 2a Error") #2a -> Achtung, eventuell bei Adressen als Ausgabe gibt es Probleme 1# return False elif c.isdigit() and strString[i+1].isdigit(): 1 # 11 elif c.isalpha() and strString[i+1].isdigit(): #if debug: # print("sentify.isGoodCasing(): -> a2 Error") # a2 return False elif c.isdigit() and strString[i+1] not in ['.','!','?','"','-','\'', ':',' ']: #if debug: # print("sentify.isGoodCasing -> text.lower()/upper() Error:") return False i=i+1 except: 1 return True def strip_accents(text): return "".join(char for char in unicodedata.normalize('NFKD', text) if unicodedata.category(char) != 'Mn') def parse_html(page): """ Clean HTML tags for webpages that aren't Gutenberg books """ try: # https://github.com/miso-belica/jusText/tree/dev/justext/stoplists parts = justext.justext(page, justext.get_stoplist('English')) #parts = justext.justext(page, justext.get_stoplist('German')) #print(parts) paragraphs = list() for part in parts: #pp.pprint(part.is_boilerplate) if not part.is_boilerplate: #pp.pprint(part) paragraphs.append(part.text) s=str('\n\n'.join(paragraphs)) if len(s) > 50: return s else: soup = BeautifulSoup(page, "html.parser") #"html.parser" "lxml") #comments = soup.findAll(text=lambda text:isinstance(text, Comment)) # remove comments #[comment.extract() for comment in comments] for script in soup.findAll(["script", "style", 'footer', 'head']): #script.extract() # rip it out script.decompose() # rip it out #[x.extract() for x in soup.findAll(['script', 'style'])] #[x.decompose() for x in soup.findAll(['script', 'style'])] soup.prettify() myText = soup.get_text() plaintextv1 = bleach.clean(myText, strip=True, strip_comments=True) plaintext = re.sub(r'<.*?>', '', plaintextv1) #plaintext = plaintext.replace('\n', ' ') #plaintext = plaintext.replace("\n", ' ') #plaintext = textify.removeDomainsSimple(plaintext) return plaintext#.strip() except Exception as e: #print("Unexpected error:", sys.exc_info()[0]) exc_type, exc_obj, exc_tb = sys.exc_info() fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] #print(exc_type, fname, exc_tb.tb_lineno) return str("") def encodeToASCII(text): encResults = text.encode('utf-8', "ignore") s_string = str(encResults.decode('ascii', "ignore")) #textv1 = re_pattern.sub(u'\uFFFD', s_string) return s_string def encodeToUTF8Adv(text): encResults = text.encode('utf-8', "ignore") #return str(encResults.decode('latin-1', "ignore")) s_string = str(encResults.decode('utf-8', "remove")) #textv1 = re_pattern.sub(u'\uFFFD', s_string) return s_string def encodeToLatin1(text): #text = text.replace('ß','ss') encResults = text.encode('utf-8', "ignore") #encResults = text.encode('utf-8', "ignore") s_string = str(encResults.decode('latin-1', "ignore")) #textv1 = re_pattern.sub(u'\uFFFD', s_string) return s_string """ MAIN """ """ for key in manager: #print (key) # key #print (manager[key]) # value mystorepath = storepathData+"/"+key if os.path.exists(mystorepath): folders = list() files = list() for entry in os.scandir(mystorepath): if entry.is_dir(): folders.append(entry.path) elif entry.is_file(): files.append(entry.path) for f in files: #print(f) with codecs.open(f, 'r', encoding='utf-8') as fr: data3 = fr.read().replace('\n', '').strip() fr.close() #tra = transformText(h) #data2 = beautifyCorpus(data1) #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2) data3 = encodeToASCII(data3) #data3 = preproc(data3) if len(data3) > 50: with codecs.open(managerDB, 'a+', encoding='ascii') as fa: #fa.write("manager###"+tra+"\n") fa.write("__label__MANAGER"+" "+data3+"\n") fa.close() for key in teenager: #print (key) # key #print (manager[key]) # value mystorepath = storepathData+"/"+key if os.path.exists(mystorepath): folders = list() files = list() for entry in os.scandir(mystorepath): if entry.is_dir(): folders.append(entry.path) elif entry.is_file(): files.append(entry.path) for f in files: #print(f) with codecs.open(f, 'r', encoding='utf-8') as fr: data3 = fr.read().replace('\n', '').strip() fr.close() #tra = transformText(h) #data2 = beautifyCorpus(data1) #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2) data3 = encodeToASCII(data3) #data3 = preproc(data3) if len(data3) > 50: with codecs.open(teenagerDB, 'a+', encoding='ascii') as fa: #fa.write("teenager###"+tra+"\n") fa.write("__label__TEENAGER"+" "+data3+"\n") fa.close() for key in parents: #print (key) # key #print (manager[key]) # value mystorepath = storepathData+"/"+key if os.path.exists(mystorepath): folders = list() files = list() for entry in os.scandir(mystorepath): if entry.is_dir(): folders.append(entry.path) elif entry.is_file(): files.append(entry.path) for f in files: #print(f) with codecs.open(f, 'r', encoding='utf-8') as fr: data3 = fr.read().replace('\n', '').strip() fr.close() #tra = transformText(h) #data2 = beautifyCorpus(data1) #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2) data3 = encodeToASCII(data3) #data3 = preproc(data3) if len(data3) > 50: with codecs.open(parentsDB, 'a+', encoding='ascii') as fa: #fa.write("parents###"+tra+"\n") fa.write("__label__PARENTS"+" "+data3+"\n") fa.close() folders = list() files = list() for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/manager/"): if entry.is_dir(): folders.append(entry.path) elif entry.is_file(): files.append(entry.path) for f in files: if f.find("en")!= -1: with codecs.open(f, 'r', encoding='utf-8') as fr: data3 = fr.read().replace('\n', '').strip() fr.close() data3 = parse_html(data3) ###tra = transformText(h) #data2 = beautifyCorpus(h) #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2) data3 = encodeToASCII(data3) #data3 = preproc(data3) if len(data3) > 50: with codecs.open(managerDB, 'a+', encoding='ascii') as fa: #fa.write("manager###"+tra+"\n") fa.write("__label__MANAGER"+" "+data3+"\n") fa.close() folders = list() files = list() for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/parents/"): if entry.is_dir(): folders.append(entry.path) elif entry.is_file(): files.append(entry.path) for f in files: if f.find("en")!= -1: with codecs.open(f, 'r', encoding='utf-8') as fr: data3 = fr.read().replace('\n', '').strip() fr.close() data3 = parse_html(data3) ###tra = transformText(h) #data2 = beautifyCorpus(h) #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2) data3 = encodeToASCII(data3) #data3 = preproc(data3) if len(data3) > 50: with codecs.open(parentsDB, 'a+', encoding='ascii') as fa: #fa.write("parents###"+tra+"\n") fa.write("__label__PARENTS"+" "+data3+"\n") fa.close() folders = list() files = list() for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/teenager/"): if entry.is_dir(): folders.append(entry.path) elif entry.is_file(): files.append(entry.path) for f in files: if f.find("en")!= -1: with codecs.open(f, 'r', encoding='utf-8') as fr: data3 = fr.read().replace('\n', '').strip() fr.close() data3 = parse_html(data3) ###tra = transformText(h) #data2 = beautifyCorpus(h) #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2) data3 = encodeToASCII(data3) #data3 = preproc(data3) if len(data3) > 50: with codecs.open(teenagerDB, 'a+', encoding='ascii') as fa: #fa.write("teenager###"+tra+"\n") fa.write("__label__TEENAGER"+" "+data3+"\n") fa.close() """ """ exit(1) """ """ print( p[0][0] ) print( type(p[0][0]) ) print() print() print( p[1].tolist() ) print( type(p[1].tolist()) ) """ exit(1) #fasttext.load_model(self.fasttext_model_file) #model.predict("Which baking dish is best to bake a banana bread ?", k=-1, threshold=0.5) #model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000) #model.save_model("/home/unaiqueFRAMEWORK/text_classify/fasttext/binary/unaique.ftz")