#!/usr/bin/python3.6 -S -W ignore
# -*- coding: utf-8 -*-

import os
import sys
import codecs
import gzip
import json
import re
import subprocess
import unicodedata  # needed by strip_accents() below

import fasttext  # pip3 install -U fasttext
from gensim.models import FastText
from gensim.test.utils import common_texts  # used by the unreachable gensim example further down
from keras.preprocessing.text import Tokenizer
import numpy as np
import matplotlib.pyplot as plt

import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import WordPunctTokenizer
#import wikipedia

# Third-party helpers used by parse_html() below
import justext
import bleach
from bs4 import BeautifulSoup

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

import spacy  # See "Installing spaCy"
nlp_de = spacy.load('de_core_news_sm')
nlp_en = spacy.load('en_core_web_sm')
nlp_de.max_length = 10000000
nlp_en.max_length = 10000000

stemmer = WordNetLemmatizer()  # despite the name, this lemmatizes rather than stems


def preprocess_text(document):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(document))

    # Remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Substitute multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Remove a prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Convert to lowercase
    document = document.lower()

    # Lemmatization, stop-word removal and length filtering
    tokens = document.split()
    tokens = [stemmer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in en_stop]
    tokens = [word for word in tokens if len(word) > 3]

    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
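# Hedged example (added for illustration; the sample string is not taken from the
# training data): preprocess_text() lowercases, lemmatizes, and drops English stop
# words plus tokens shorter than four characters.
print("preprocess_text demo:", preprocess_text("The networks were training on several large GPUs!"))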
def _is_wordlike(tok):
    return tok.orth_ and tok.orth_[0].isalpha()


def sentence_division_suppresor(doc):
    """Spacy pipeline component that prohibits sentence segmentation between two
    tokens that start with a letter. Useful for taming overzealous sentence
    segmentation in the German model, and possibly others as well."""
    for i, tok in enumerate(doc[:-1]):
        if _is_wordlike(tok) and _is_wordlike(doc[i + 1]):
            doc[i + 1].is_sent_start = False
    return doc


# spaCy v2 API: the component callable is passed directly (spaCy v3 would require
# registering it via @Language.component and adding it by name).
nlp_de.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser')
nlp_en.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser')

# Filter via shell:
#cat unaique.all.txt | sed -e "s/([.\!?,'/()])/ 1 /g" | tr "[:upper:]" "[:lower:]" >> unaique.preprocessed.txt

# https://github.com/facebookresearch/fastText/blob/master/docs/supervised-tutorial.md
# http://soner.in/fasttext-grid-search/
# https://github.com/facebookresearch/fastText/blob/master/python/fasttext_module/fasttext/FastText.py

#train_data_tmp="/home/unaiqueFRAMEWORK/new_prototyp/fasttext/trainingsdata/train/unaique.preprocessed.txt"
train_data = "/home/unaiqueFRAMEWORK/new_prototyp/fasttext/trainingsdata/train/unaique.preprocessed.txt"

with codecs.open(train_data, 'r', encoding='utf-8') as fr:
    data1 = fr.read().replace('\n', '').strip()
    #data1 = fr.readlines()

artificial_intelligence = sent_tokenize(data1)
final_corpus = [preprocess_text(sentence) for sentence in artificial_intelligence if sentence.strip() != '']

word_punctuation_tokenizer = nltk.WordPunctTokenizer()
word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in final_corpus]

embedding_size = 60
window_size = 40
min_word = 5
down_sampling = 1e-2

# gensim 3.x parameter names (size/iter); gensim 4 renames them to vector_size/epochs.
ft_model = FastText(word_tokenized_corpus,
                    size=embedding_size,
                    window=window_size,
                    min_count=min_word,
                    sample=down_sampling,
                    sg=1,
                    iter=100)

semantically_similar_words = {word: [item[0] for item in ft_model.wv.most_similar([word], topn=5)]
                              for word in ['artificial', 'intelligence', 'machine', 'network', 'recurrent', 'deep']}

for k, v in semantically_similar_words.items():
    print(k + ":" + str(v))

print(ft_model.wv.similarity(w1='artificial', w2='intelligence'))
exit(1)

# Unreachable example: gensim FastText on gensim's bundled common_texts corpus.
model = FastText(size=4, window=3, min_count=1)  # instantiate
model.build_vocab(sentences=common_texts)
model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)  # train

"""
r=codecs.open(train_data_tmp, 'r', encoding='utf-8')
data3 = r.read().replace('\n', '').strip().lower()
r.close()
data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data3)
w=codecs.open(train_data, 'w', encoding='ascii')
w.write(data3)
w.close()
"""

#model = fasttext.train_supervised(train_data, model='skipgram', lr=0.05, dim=250, ws=5, epoch=50)
#model = fasttext.train_supervised(train_data, epoch=100, lr=0.25, wordNgrams=3, verbose=2, dim=300, ws=25, minCount=1, loss="softmax")
#model = fasttext.train_supervised(train_data, epoch=50, lr=0.1, wordNgrams=3, verbose=2, minCount=1, dim=200, ws=10, loss='hs', thread=10)
# The official fastText binding calls the label-prefix parameter 'label' ('__label__' is also its default).
model = fasttext.train_supervised(train_data, epoch=100, lr=0.05, wordNgrams=3, thread=10, label='__label__')
model.save_model("/home/unaiqueFRAMEWORK/new_prototyp/fasttext/unaique_full.bin")
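# Hedged example (added; not part of the original pipeline): sanity-check the freshly
# trained classifier on an arbitrary sentence. The labels and probabilities returned
# depend entirely on the __label__ annotations present in train_data.
demo_labels, demo_probs = model.predict("how do i explain this to my manager", k=3)
print("fastText demo prediction:", demo_labels, demo_probs)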
#train_data2="/home/unaiqueFRAMEWORK/new_prototyp/fasttext/trainingsdata/teenager_fasttext70.txt"
#model = fasttext.train_unsupervised(train_data2, model='skipgram', lr=0.05, dim=200, ws=5, epoch=50)
#model.save_model("/home/unaiqueFRAMEWORK/new_prototyp/fasttext/binary/teenager_fasttext.bin")

exit(1)

#fasttext.load_model(self.fasttext_model_file)
#model.predict("Which baking dish is best to bake a banana bread ?", k=-1, threshold=0.5)

model.quantize(input=train_data, qnorm=True, retrain=False, cutoff=50000)
model.save_model("/home/unaiqueFRAMEWORK/new_prototyp/fasttext/unaique_small.ftz")
exit(1)


def __normalize_text(s):
    # SUBEXES (a list of sed substitution expressions) is expected to be defined elsewhere.
    for subex in SUBEXES:
        s = subprocess.check_output(['sed', subex], input=s.encode("latin-1")).decode("utf-8")
        # s = subprocess.check_output(['sed', subex], input=s.encode("iso-8859-1")).decode("utf-8")
    return s


def __spaces(s):
    return ' '.join(s.split())


def __digits(s):
    return ''.join(filter(lambda c: not c.isdigit(), s))


def preproc(s):
    return __digits(__spaces(s.lower()))


def beautifyCorpus(text):
    rList = list()
    nlp_de.max_length = len(text) + 1
    doc = nlp_de(text)
    for sent in doc.sents:
        s = str(sent)
        flag = isDublicateString(s)
        flag2 = isGoodCasing(s)
        if not flag and flag2:
            rList.append(s)
    return " ".join(rList)


def isDublicateString(text):
    t_Text = text.lower()
    #debug = True
    patternElement = ""
    n = 16  # number of leading characters that must repeat for the text to count as a duplicate
    if len(t_Text) >= n + 3:
        patternElement = t_Text[:n].strip()
        #print("PatternElement:", patternElement)
        r_Count = t_Text.count(patternElement)
        if r_Count > 1:
            #if debug:
            #    print("sentify.isNotDublicateString(): -> multiplePattern Error:", r_Count, ",", patternElement)
            return True
    # Example: "Breath of the WildThe Legend of Zelda: Breath of the Wild auf der E3 angespielt."
    # -> ['breath of the wi', 'ldthe legend of ', 'zelda: breath of', ' the wild auf de', 'r e3 angespielt.']
    t_List = [t_Text[i:i + n] for i in range(0, len(t_Text), n)]
    for t in t_List:
        t = t.strip()
        r_Count = t_Text.count(t)
        if r_Count > 1:
            #if debug:
            #    print("sentify.isNotDublicateString(): -> multiplePattern Error:", r_Count, ",", t)
            return True
    return False


def isGoodCasing(text):
    if not isinstance(text, str):
        return False
    strString = text  # str(text, 'utf-8')
    i = 0
    try:
        for c in strString:
            if c in [u'ß', u'ö', u'ä', u'ü', u'Ö', u'Ä', u'Ü']:
                pass
            elif c != " " and i + 1 < len(strString):
                if c.islower() and strString[i + 1].isupper():
                    # too many false positives
                    #print("sentify.isGoodCasing(): -> aA Error")  # aA
                    return False
                if c.isdigit() and (strString[i + 1].isupper() and strString[i + 1].isalpha()):
                    #if debug:
                    #    print("sentify.isGoodCasing(): -> 2P Error")
                    return False
                elif c.isdigit() and strString[i + 1].isalpha():
                    # too many false positives
                    #print("sentify.isGoodCasing(): -> 2a Error")  # 2a -> caution: may misfire on addresses in the output
                    pass  # return False
                elif c.isdigit() and strString[i + 1].isdigit():
                    pass  # 11
                elif c.isalpha() and strString[i + 1].isdigit():
                    #if debug:
                    #    print("sentify.isGoodCasing(): -> a2 Error")  # a2
                    return False
                elif c.isdigit() and strString[i + 1] not in ['.', '!', '?', '"', '-', '\'', ':', ' ']:
                    #if debug:
                    #    print("sentify.isGoodCasing -> text.lower()/upper() Error:")
                    return False
            i = i + 1
    except Exception:
        pass
    return True


def strip_accents(text):
    return "".join(char for char in unicodedata.normalize('NFKD', text)
                   if unicodedata.category(char) != 'Mn')
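# Hedged examples (added for illustration, not part of the original script):
# strip_accents() removes combining marks, and isDublicateString() flags the example
# sentence quoted in its comment because its 16-character prefix occurs twice.
print("strip_accents demo:", strip_accents("Café Zürich"))
print("isDublicateString demo:",
      isDublicateString("Breath of the WildThe Legend of Zelda: Breath of the Wild auf der E3 angespielt."))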
def parse_html(page):
    """Clean HTML tags for webpages that aren't Gutenberg books."""
    try:
        # https://github.com/miso-belica/jusText/tree/dev/justext/stoplists
        parts = justext.justext(page, justext.get_stoplist('English'))
        #parts = justext.justext(page, justext.get_stoplist('German'))
        #print(parts)
        paragraphs = list()
        for part in parts:
            #pp.pprint(part.is_boilerplate)
            if not part.is_boilerplate:
                #pp.pprint(part)
                paragraphs.append(part.text)
        s = str('\n\n'.join(paragraphs))
        if len(s) > 50:
            return s
        else:
            # Fallback: strip markup with BeautifulSoup if jusText kept too little text.
            soup = BeautifulSoup(page, "html.parser")  # "html.parser" or "lxml"
            #comments = soup.findAll(text=lambda text: isinstance(text, Comment))  # remove comments
            #[comment.extract() for comment in comments]
            for script in soup.findAll(["script", "style", 'footer', 'head']):
                #script.extract()  # rip it out
                script.decompose()  # rip it out
            #[x.extract() for x in soup.findAll(['script', 'style'])]
            #[x.decompose() for x in soup.findAll(['script', 'style'])]
            soup.prettify()
            myText = soup.get_text()
            plaintextv1 = bleach.clean(myText, strip=True, strip_comments=True)
            plaintext = re.sub(r'<.*?>', '', plaintextv1)
            #plaintext = plaintext.replace('\n', ' ')
            #plaintext = textify.removeDomainsSimple(plaintext)
            return plaintext  #.strip()
    except Exception:
        #print("Unexpected error:", sys.exc_info()[0])
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        #print(exc_type, fname, exc_tb.tb_lineno)
        return str("")


def encodeToASCII(text):
    encResults = text.encode('utf-8', "ignore")
    s_string = str(encResults.decode('ascii', "ignore"))
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToUTF8Adv(text):
    encResults = text.encode('utf-8', "ignore")
    #return str(encResults.decode('latin-1', "ignore"))
    s_string = str(encResults.decode('utf-8', "ignore"))  # "remove" is not a valid error handler; use "ignore"
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToLatin1(text):
    #text = text.replace('ß', 'ss')
    encResults = text.encode('utf-8', "ignore")
    s_string = str(encResults.decode('latin-1', "ignore"))
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string
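# Hedged example (added for illustration): encodeToASCII() silently drops non-ASCII
# characters, while encodeToLatin1() re-reads the UTF-8 bytes as Latin-1, so umlauts
# survive as two mojibake characters instead of disappearing.
print("encodeToASCII demo:", encodeToASCII("Köln grüßt dich"))
print("encodeToLatin1 demo:", encodeToLatin1("Köln grüßt dich"))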
"""
MAIN
"""

"""
for key in manager:
    #print (key)           # key
    #print (manager[key])  # value
    mystorepath = storepathData+"/"+key
    if os.path.exists(mystorepath):
        folders = list()
        files = list()
        for entry in os.scandir(mystorepath):
            if entry.is_dir():
                folders.append(entry.path)
            elif entry.is_file():
                files.append(entry.path)
        for f in files:
            #print(f)
            with codecs.open(f, 'r', encoding='utf-8') as fr:
                data3 = fr.read().replace('\n', '').strip()
                fr.close()
            #tra = transformText(h)
            #data2 = beautifyCorpus(data1)
            #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
            data3 = encodeToASCII(data3)
            #data3 = preproc(data3)
            if len(data3) > 50:
                with codecs.open(managerDB, 'a+', encoding='ascii') as fa:
                    #fa.write("manager###"+tra+"\n")
                    fa.write("__label__MANAGER"+" "+data3+"\n")
                    fa.close()

for key in teenager:
    #print (key)           # key
    #print (manager[key])  # value
    mystorepath = storepathData+"/"+key
    if os.path.exists(mystorepath):
        folders = list()
        files = list()
        for entry in os.scandir(mystorepath):
            if entry.is_dir():
                folders.append(entry.path)
            elif entry.is_file():
                files.append(entry.path)
        for f in files:
            #print(f)
            with codecs.open(f, 'r', encoding='utf-8') as fr:
                data3 = fr.read().replace('\n', '').strip()
                fr.close()
            #tra = transformText(h)
            #data2 = beautifyCorpus(data1)
            #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
            data3 = encodeToASCII(data3)
            #data3 = preproc(data3)
            if len(data3) > 50:
                with codecs.open(teenagerDB, 'a+', encoding='ascii') as fa:
                    #fa.write("teenager###"+tra+"\n")
                    fa.write("__label__TEENAGER"+" "+data3+"\n")
                    fa.close()

for key in parents:
    #print (key)           # key
    #print (manager[key])  # value
    mystorepath = storepathData+"/"+key
    if os.path.exists(mystorepath):
        folders = list()
        files = list()
        for entry in os.scandir(mystorepath):
            if entry.is_dir():
                folders.append(entry.path)
            elif entry.is_file():
                files.append(entry.path)
        for f in files:
            #print(f)
            with codecs.open(f, 'r', encoding='utf-8') as fr:
                data3 = fr.read().replace('\n', '').strip()
                fr.close()
            #tra = transformText(h)
            #data2 = beautifyCorpus(data1)
            #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
            data3 = encodeToASCII(data3)
            #data3 = preproc(data3)
            if len(data3) > 50:
                with codecs.open(parentsDB, 'a+', encoding='ascii') as fa:
                    #fa.write("parents###"+tra+"\n")
                    fa.write("__label__PARENTS"+" "+data3+"\n")
                    fa.close()

folders = list()
files = list()
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/manager/"):
    if entry.is_dir():
        folders.append(entry.path)
    elif entry.is_file():
        files.append(entry.path)
for f in files:
    if f.find("en") != -1:
        with codecs.open(f, 'r', encoding='utf-8') as fr:
            data3 = fr.read().replace('\n', '').strip()
            fr.close()
        data3 = parse_html(data3)
        ###tra = transformText(h)
        #data2 = beautifyCorpus(h)
        #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
        data3 = encodeToASCII(data3)
        #data3 = preproc(data3)
        if len(data3) > 50:
            with codecs.open(managerDB, 'a+', encoding='ascii') as fa:
                #fa.write("manager###"+tra+"\n")
                fa.write("__label__MANAGER"+" "+data3+"\n")
                fa.close()

folders = list()
files = list()
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/parents/"):
    if entry.is_dir():
        folders.append(entry.path)
    elif entry.is_file():
        files.append(entry.path)
for f in files:
    if f.find("en") != -1:
        with codecs.open(f, 'r', encoding='utf-8') as fr:
            data3 = fr.read().replace('\n', '').strip()
            fr.close()
        data3 = parse_html(data3)
        ###tra = transformText(h)
        #data2 = beautifyCorpus(h)
        #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
        data3 = encodeToASCII(data3)
        #data3 = preproc(data3)
        if len(data3) > 50:
            with codecs.open(parentsDB, 'a+', encoding='ascii') as fa:
                #fa.write("parents###"+tra+"\n")
                fa.write("__label__PARENTS"+" "+data3+"\n")
                fa.close()

folders = list()
files = list()
for entry in os.scandir("/home/unaiqueFRAMEWORK/text_classify/training_data/teenager/"):
    if entry.is_dir():
        folders.append(entry.path)
    elif entry.is_file():
        files.append(entry.path)
for f in files:
    if f.find("en") != -1:
        with codecs.open(f, 'r', encoding='utf-8') as fr:
            data3 = fr.read().replace('\n', '').strip()
            fr.close()
        data3 = parse_html(data3)
        ###tra = transformText(h)
        #data2 = beautifyCorpus(h)
        #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
        data3 = encodeToASCII(data3)
        #data3 = preproc(data3)
        if len(data3) > 50:
            with codecs.open(teenagerDB, 'a+', encoding='ascii') as fa:
                #fa.write("teenager###"+tra+"\n")
                fa.write("__label__TEENAGER"+" "+data3+"\n")
                fa.close()
"""

"""
exit(1)
"""
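# Hedged sketch (added; not part of the original script): how the quantized classifier
# saved above could be loaded for inference later. Guarded so it only runs if the
# .ftz file actually exists on disk.
_ftz_path = "/home/unaiqueFRAMEWORK/new_prototyp/fasttext/unaique_small.ftz"
if os.path.exists(_ftz_path):
    _clf = fasttext.load_model(_ftz_path)
    print(_clf.predict("example sentence to classify", k=1))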