#!/usr/bin/python3.6 -S -W ignore
# -*- coding: utf-8 -*-

import os
import sys
import codecs
import gzip
import json
import re
import subprocess
import unicodedata

import fasttext  # pip3 install -U fasttext
import justext   # used by parse_html() below
import bleach    # used by parse_html() below
from bs4 import BeautifulSoup  # used by parse_html() below

import spacy  # see "Installing spaCy"

nlp_de = spacy.load('de_core_news_sm')
nlp_en = spacy.load('en_core_web_sm')
nlp_de.max_length = 10000000
nlp_en.max_length = 10000000


def _is_wordlike(tok):
    return tok.orth_ and tok.orth_[0].isalpha()


def sentence_division_suppresor(doc):
    """Spacy pipeline component that prohibits sentence segmentation between two
    tokens that start with a letter. Useful for taming overzealous sentence
    segmentation in the German model, possibly others as well."""
    for i, tok in enumerate(doc[:-1]):
        if _is_wordlike(tok) and _is_wordlike(doc[i + 1]):
            doc[i + 1].is_sent_start = False
    return doc


# spaCy v2-style registration: the component function is passed directly.
nlp_de.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser')
nlp_en.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser')

# Filter via shell:
#cat unaique.txt | sed -e "s/([.\!?,'/()])/ 1 /g" | tr "[:upper:]" "[:lower:]" >> unaique.preprocessed.txt

# https://github.com/facebookresearch/fastText/blob/master/docs/supervised-tutorial.md
# http://soner.in/fasttext-grid-search/
# https://github.com/facebookresearch/fastText/blob/master/python/fasttext_module/fasttext/FastText.py

#train_data_tmp="/home/unaiqueFRAMEWORK/new_prototyp/fasttext/trainingsdata/train/unaique.preprocessed.txt"
train_data = "/home/seo-auto-scaler/data/data.txt"

"""
r = codecs.open(train_data_tmp, 'r', encoding='utf-8')
data3 = r.read().replace('\n', '').strip().lower()
r.close()
data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data3)
w = codecs.open(train_data, 'w', encoding='ascii')
w.write(data3)
w.close()
"""

#model = fasttext.train_supervised(train_data, model='skipgram', lr=0.05, dim=250, ws=5, epoch=50)
#model = fasttext.train_supervised(train_data, epoch=100, lr=0.25, wordNgrams=3, verbose=2, dim=300, ws=25, minCount=1, loss="softmax")
#model = fasttext.train_supervised(train_data, epoch=50, lr=0.1, wordNgrams=3, verbose=2, minCount=1, dim=200, ws=10, loss='hs', thread=10)
# The official fasttext Python binding calls this parameter 'label', not 'label_prefix'.
model = fasttext.train_supervised(train_data, epoch=100, lr=0.25, wordNgrams=3, thread=10, label='__label__')
model.save_model("/home/seo-auto-scaler/data/model/seo.bin")

#train_data2="/home/unaiqueFRAMEWORK/new_prototyp/fasttext/trainingsdata/teenager_fasttext70.txt"
#model = fasttext.train_unsupervised(train_data2, model='skipgram', lr=0.05, dim=200, ws=5, epoch=50)
#model.save_model("/home/unaiqueFRAMEWORK/new_prototyp/fasttext/binary/teenager_fasttext.bin")

sys.exit(1)  # everything below this point is currently disabled

#fasttext.load_model(self.fasttext_model_file)
#model.predict("Which baking dish is best to bake a banana bread ?", k=-1, threshold=0.5)

model.quantize(input=train_data, qnorm=True, retrain=False, cutoff=50000)
model.save_model("/home/unaiqueFRAMEWORK/new_prototyp/fasttext/unaique_small.ftz")

sys.exit(1)


def __normalize_text(s):
    # NOTE: SUBEXES (a list of sed expressions) is not defined anywhere in this script.
    for subex in SUBEXES:
        s = subprocess.check_output(['sed', subex], input=s.encode("latin-1")).decode("utf-8")
        #s = subprocess.check_output(['sed', subex], input=s.encode("iso-8859-1")).decode("utf-8")
    return s


def __spaces(s):
    return ' '.join(s.split())


def __digits(s):
    return ''.join(filter(lambda c: not c.isdigit(), s))


def preproc(s):
    return __digits(__spaces(s.lower()))


def beautifyCorpus(text):
    rList = list()
    nlp_de.max_length = len(text) + 1
    doc = nlp_de(text)
    for sent in doc.sents:
        s = str(sent)
        flag = isDublicateString(s)
        flag2 = isGoodCasing(s)
        if not flag and flag2:
            rList.append(s)
    return "\n".join(rList)
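# Hedged evaluation sketch (not part of the original pipeline): once the classifier
# above has been trained and saved, precision/recall could be checked against a
# held-out file. fasttext.load_model(), model.test() and model.predict() are standard
# fasttext API calls; the validation file path below is an assumption.
def evaluate_seo_model(model_path="/home/seo-auto-scaler/data/model/seo.bin",
                       valid_data="/home/seo-auto-scaler/data/valid.txt"):
    m = fasttext.load_model(model_path)
    n, precision, recall = m.test(valid_data)  # number of samples, P@1, R@1
    print("samples:", n, "P@1:", precision, "R@1:", recall)
    # top-3 labels with probabilities for a single input line
    print(m.predict("example sentence to classify", k=3))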
".join(rList) def isDublicateString(text): t_Text = text.lower() #debug = True patternElement = "" n = 16# neun zeichen müssen gleich sein if len(t_Text) >= n+3: patternElement = t_Text[:n] patternElement = patternElement.strip() #print("PatternElement:", patternElement) r_Count = t_Text.count(patternElement) if r_Count > 1: #if debug: # print("sentify.isNotDublicateString(): -> multiplePattern Error:", r_Count, ",", patternElement) return True #satz = "Breath of the WildThe Legend of Zelda: Breath of the Wild auf der E3 angespielt." #['breath of th', 'e wildthe le', 'gend of zeld', 'a: breath of', ' the wild au', 'f der e3 ang', 'espielt.'] t_List = [t_Text[i:i+n] for i in range(0, len(t_Text), n)] for t in t_List: t = t.strip() r_Count = t_Text.count(t) if r_Count > 1: #if debug: # print("sentify.isNotDublicateString(): -> multiplePattern Error:", r_Count, ",", t) return True return False def isGoodCasing(text): if not isinstance(text, str): return False strString = text #str(text,'utf-8') i=0 try: for c in strString: if c in [u'ß',u'ö',u'ä',u'ü',u'Ö',u'Ä',u'Ü']: 1 elif c != " " and len(strString) >=i+1: if c.islower() and strString[i+1].isupper(): # zuviel positive falses #print("sentify.isGoodCasing(): -> aA Error") #aA return False if c.isdigit() and (strString[i+1].isupper() and strString[i+1].isalpha()): #if debug: # print("sentify.isGoodCasing(): -> 2P Error") return False elif c.isdigit() and strString[i+1].isalpha(): # to much positive falses #print("sentify.isGoodCasing(): -> 2a Error") #2a -> Achtung, eventuell bei Adressen als Ausgabe gibt es Probleme 1# return False elif c.isdigit() and strString[i+1].isdigit(): 1 # 11 elif c.isalpha() and strString[i+1].isdigit(): #if debug: # print("sentify.isGoodCasing(): -> a2 Error") # a2 return False elif c.isdigit() and strString[i+1] not in ['.','!','?','"','-','\'', ':',' ']: #if debug: # print("sentify.isGoodCasing -> text.lower()/upper() Error:") return False i=i+1 except: 1 return True def strip_accents(text): return "".join(char for char in unicodedata.normalize('NFKD', text) if unicodedata.category(char) != 'Mn') def parse_html(page): """ Clean HTML tags for webpages that aren't Gutenberg books """ try: # https://github.com/miso-belica/jusText/tree/dev/justext/stoplists parts = justext.justext(page, justext.get_stoplist('English')) #parts = justext.justext(page, justext.get_stoplist('German')) #print(parts) paragraphs = list() for part in parts: #pp.pprint(part.is_boilerplate) if not part.is_boilerplate: #pp.pprint(part) paragraphs.append(part.text) s=str('\n\n'.join(paragraphs)) if len(s) > 50: return s else: soup = BeautifulSoup(page, "html.parser") #"html.parser" "lxml") #comments = soup.findAll(text=lambda text:isinstance(text, Comment)) # remove comments #[comment.extract() for comment in comments] for script in soup.findAll(["script", "style", 'footer', 'head']): #script.extract() # rip it out script.decompose() # rip it out #[x.extract() for x in soup.findAll(['script', 'style'])] #[x.decompose() for x in soup.findAll(['script', 'style'])] soup.prettify() myText = soup.get_text() plaintextv1 = bleach.clean(myText, strip=True, strip_comments=True) plaintext = re.sub(r'<.*?>', '', plaintextv1) #plaintext = plaintext.replace('\n', ' ') #plaintext = plaintext.replace("\n", ' ') #plaintext = textify.removeDomainsSimple(plaintext) return plaintext#.strip() except Exception as e: #print("Unexpected error:", sys.exc_info()[0]) exc_type, exc_obj, exc_tb = sys.exc_info() fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] 
def encodeToASCII(text):
    encResults = text.encode('utf-8', "ignore")
    s_string = str(encResults.decode('ascii', "ignore"))
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToUTF8Adv(text):
    encResults = text.encode('utf-8', "ignore")
    #return str(encResults.decode('latin-1', "ignore"))
    s_string = str(encResults.decode('utf-8', "ignore"))  # 'remove' is not a valid error handler; 'ignore' is
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToLatin1(text):
    #text = text.replace('ß','ss')
    encResults = text.encode('utf-8', "ignore")
    s_string = str(encResults.decode('latin-1', "ignore"))
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string
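# Hedged illustration of the three encoders above (the sample string is an assumption):
#   encodeToASCII("Straße")   -> "Strae"       (non-ASCII bytes are dropped)
#   encodeToUTF8Adv("Straße") -> "Straße"      (UTF-8 round-trip, text unchanged)
#   encodeToLatin1("Straße")  -> "StraÃ\x9fe"  (UTF-8 bytes re-read as Latin-1, i.e. mojibake)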
"""
MAIN
"""

"""
for key in manager:
    #print(key)           # key
    #print(manager[key])  # value
    mystorepath = storepathData + "/" + key
    if os.path.exists(mystorepath):
        folders = list()
        files = list()
        for entry in os.scandir(mystorepath):
            if entry.is_dir():
                folders.append(entry.path)
            elif entry.is_file():
                files.append(entry.path)
        for f in files:
            #print(f)
            with codecs.open(f, 'r', encoding='utf-8') as fr:
                data3 = fr.read().replace('\n', '').strip()
                fr.close()
            #tra = transformText(h)
            #data2 = beautifyCorpus(data1)
            #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
            data3 = encodeToASCII(data3)
            #data3 = preproc(data3)
            if len(data3) > 50:
                with codecs.open(managerDB, 'a+', encoding='ascii') as fa:
                    #fa.write("manager###"+tra+"\n")
                    fa.write("__label__MANAGER" + " " + data3 + "\n")
                    fa.close()

for key in teenager:
    #print(key)           # key
    #print(manager[key])  # value
    mystorepath = storepathData + "/" + key
    if os.path.exists(mystorepath):
        folders = list()
        files = list()
        for entry in os.scandir(mystorepath):
            if entry.is_dir():
                folders.append(entry.path)
            elif entry.is_file():
                files.append(entry.path)
        for f in files:
            #print(f)
            with codecs.open(f, 'r', encoding='utf-8') as fr:
                data3 = fr.read().replace('\n', '').strip()
                fr.close()
            #tra = transformText(h)
            #data2 = beautifyCorpus(data1)
            #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
            data3 = encodeToASCII(data3)
            #data3 = preproc(data3)
            if len(data3) > 50:
                with codecs.open(teenagerDB, 'a+', encoding='ascii') as fa:
                    #fa.write("teenager###"+tra+"\n")
                    fa.write("__label__TEENAGER" + " " + data3 + "\n")
                    fa.close()

for key in parents:
    #print(key)           # key
    #print(manager[key])  # value
    mystorepath = storepathData + "/" + key
    if os.path.exists(mystorepath):
        folders = list()
        files = list()
        for entry in os.scandir(mystorepath):
            if entry.is_dir():
                folders.append(entry.path)
            elif entry.is_file():
                files.append(entry.path)
        for f in files:
            #print(f)
            with codecs.open(f, 'r', encoding='utf-8') as fr:
                data3 = fr.read().replace('\n', '').strip()
                fr.close()
            #tra = transformText(h)
            #data2 = beautifyCorpus(data1)
            #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
            data3 = encodeToASCII(data3)
            #data3 = preproc(data3)
            if len(data3) > 50:
                with codecs.open(parentsDB, 'a+', encoding='ascii') as fa:
                    #fa.write("parents###"+tra+"\n")
                    fa.write("__label__PARENTS" + " " + data3 + "\n")
                    fa.close()

folders = list()
files = list()
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/manager/"):
    if entry.is_dir():
        folders.append(entry.path)
    elif entry.is_file():
        files.append(entry.path)
for f in files:
    if f.find("en") != -1:
        with codecs.open(f, 'r', encoding='utf-8') as fr:
            data3 = fr.read().replace('\n', '').strip()
            fr.close()
        data3 = parse_html(data3)
        ###tra = transformText(h)
        #data2 = beautifyCorpus(h)
        #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
        data3 = encodeToASCII(data3)
        #data3 = preproc(data3)
        if len(data3) > 50:
            with codecs.open(managerDB, 'a+', encoding='ascii') as fa:
                #fa.write("manager###"+tra+"\n")
                fa.write("__label__MANAGER" + " " + data3 + "\n")
                fa.close()

folders = list()
files = list()
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/parents/"):
    if entry.is_dir():
        folders.append(entry.path)
    elif entry.is_file():
        files.append(entry.path)
for f in files:
    if f.find("en") != -1:
        with codecs.open(f, 'r', encoding='utf-8') as fr:
            data3 = fr.read().replace('\n', '').strip()
            fr.close()
        data3 = parse_html(data3)
        ###tra = transformText(h)
        #data2 = beautifyCorpus(h)
        #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
        data3 = encodeToASCII(data3)
        #data3 = preproc(data3)
        if len(data3) > 50:
            with codecs.open(parentsDB, 'a+', encoding='ascii') as fa:
                #fa.write("parents###"+tra+"\n")
                fa.write("__label__PARENTS" + " " + data3 + "\n")
                fa.close()

folders = list()
files = list()
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/teenager/"):
    if entry.is_dir():
        folders.append(entry.path)
    elif entry.is_file():
        files.append(entry.path)
for f in files:
    if f.find("en") != -1:
        with codecs.open(f, 'r', encoding='utf-8') as fr:
            data3 = fr.read().replace('\n', '').strip()
            fr.close()
        data3 = parse_html(data3)
        ###tra = transformText(h)
        #data2 = beautifyCorpus(h)
        #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
        data3 = encodeToASCII(data3)
        #data3 = preproc(data3)
        if len(data3) > 50:
            with codecs.open(teenagerDB, 'a+', encoding='ascii') as fa:
                #fa.write("teenager###"+tra+"\n")
                fa.write("__label__TEENAGER" + " " + data3 + "\n")
                fa.close()
"""

"""
exit(1)
"""
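# Hedged consolidation sketch of the disabled MAIN block above: the per-audience loops
# differ only in source directory, label and output file, so they could be collapsed
# into one helper. storepathData, managerDB, teenagerDB and parentsDB are referenced in
# the disabled block but never defined in this script; src_dir/label/out_file below are
# therefore assumptions, not part of the original pipeline.
def append_labeled_corpus(src_dir, label, out_file):
    """Read every file under src_dir, clean it, and append one
    '__label__<LABEL> <text>' training line per document to out_file."""
    if not os.path.exists(src_dir):
        return
    for entry in os.scandir(src_dir):
        if not entry.is_file():
            continue
        with codecs.open(entry.path, 'r', encoding='utf-8') as fr:
            data = fr.read().replace('\n', ' ').strip()
        data = encodeToASCII(data)
        if len(data) > 50:
            with codecs.open(out_file, 'a+', encoding='ascii') as fa:
                fa.write("__label__" + label + " " + data + "\n")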