# read in some helpful libraries
import nltk                      # the natural language toolkit, open-source NLP
import pandas as pd              # pandas dataframe
import numpy as np
import re                        # regular expressions
import gensim
from gensim import parsing       # helps to preprocess the data very efficiently
from gensim.utils import lemmatize
from nltk.corpus import stopwords

import codecs
import concurrent.futures
import datetime
import gzip
import io
import json
import logging
import math
import multiprocessing as mpl
import os
import pprint
import random
import signal
import sys
import unicodedata
import uuid
from contextlib import contextmanager
from io import BytesIO, StringIO
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from pathlib import Path
from time import time as timer

#import cdx_toolkit
import lxml
import warcio                    # pip3 install --upgrade warcio
from warcio.archiveiterator import ArchiveIterator
import justext                   # pip install justext
import requests                  # pip3 install --upgrade requests
#import urllib.request
#from urllib.parse import urlparse  # pip3 install --upgrade urlparse
#import urllib  # url = urllib.unquote(url).decode('utf8')
import urllib3
urllib3.disable_warnings()
from bs4 import BeautifulSoup, Comment
import bleach                    # https://pypi.org/project/bleach/
import dill                      # important for joblib load and dump
import langid
from langdetect import detect    # also https://github.com/saffsd/langid.py

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from tabulate import tabulate

pp = pprint.PrettyPrinter(indent=4)

import spacy  # see "Installing spaCy"

nlp_de = spacy.load('de_core_news_sm')
nlp_en = spacy.load('en_core_web_sm')
nlp_de.max_length = 10000000
nlp_en.max_length = 10000000


def _is_wordlike(tok):
    return tok.orth_ and tok.orth_[0].isalpha()


def sentence_division_suppresor(doc):
    """spaCy pipeline component that prohibits sentence segmentation between
    two tokens that start with a letter.

    Useful for taming overzealous sentence segmentation in the German model,
    possibly others as well."""
    for i, tok in enumerate(doc[:-1]):
        if _is_wordlike(tok) and _is_wordlike(doc[i + 1]):
            doc[i + 1].is_sent_start = False
    return doc


# spaCy 2.x API: custom components are passed in as plain functions
nlp_de.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser')
nlp_en.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser')
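# --- hedged example (not in the original script, sample sentence is made up):
# --- quick sanity check of the 'sent_fix' component registered above.
_demo = nlp_de('Der Vorstand trifft sich am Montag Das Protokoll folgt später')
print([s.text for s in _demo.sents])  # expected: a single "sentence", because
                                      # splits between two word-like tokens are suppressed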
"""
500 MB source file for training:
  350 MB manager
  150 MB teenager
then train for ~3 h on Colab and store the results in Google Drive
"""

# Sample random lines from a file:  perl -ne 'print if (rand() < .1)' manager_train.txt > manager_training.txt
# random sampling -> https://unix.stackexchange.com/questions/108581/how-to-randomly-sample-a-subset-of-a-file
#cat teenink/* teenreads/* girlscouts/* > kids.txt
#cat alphamom/* babble/* > parents.txt
#cat bloomberg/* ceo/* chiefexecutive/* cio/* forbes/* foreignaffairs/* hbr/* inc/* managementexchange/* n2growth/* real-leaders/* strategy-business/* > manager.txt
#cat bloomberg/1* >> manager.txt
#https://www.kaggle.com/arunava21/word2vec-and-random-forest-classification

storepathData = "/home/unaiqueFRAMEWORK/data"
storepath = "/home/unaiqueFRAMEWORK/text_classify/training_data"
#managerDB = storepath + "/gpt2_manager_train.txt"
managerDB = "/home/unaiqueFRAMEWORK/new_prototyp/training/data/manager_train.txt"

manager = {"bloomberg": 'bloomberg.com', "cio": 'cio.com', "hbr": 'hbr.org',
           "forbes": 'forbes.com', 'foreignaffairs': 'foreignaffairs.com',
           'ceo': 'ceo.com', 'chiefexecutive': 'chiefexecutive.net',
           'economist': 'economist.com', 'strategy-business': 'strategy-business.com',
           'managementexchange': 'managementexchange.com', 'real-leaders': 'real-leaders.com',
           'inc': 'inc.com', 'n2growth': 'n2growth.com', "wsj": 'wsj.com'}

#https://stackoverflow.com/questions/12221387/how-to-extract-the-first-x-megabyte-from-a-large-file-in-unix-linux
print(len(manager))
print()
print(150 / len(manager))  # the whole teenager text is about 145 MB
#exit(1)


def beautifyCorpus(text):
    # currently a no-op; the sentence-level filtering below is disabled
    return text
    """
    rList = list()
    nlp_de.max_length = len(text) + 1
    doc = nlp_de(text)
    for sent in doc.sents:
        s = str(sent)
        flag = isDublicateString(s)
        flag2 = isGoodCasing(s)
        if not flag and flag2:
            rList.append(s)
    return " ".join(rList)
    """


def isDublicateString(text):
    """Return True if the text contains the same 16-character chunk more than once."""
    t_Text = text.lower()
    #debug = True
    patternElement = ""
    n = 16  # 16 consecutive characters must match
    if len(t_Text) >= n + 3:
        patternElement = t_Text[:n].strip()
        #print("PatternElement:", patternElement)
        r_Count = t_Text.count(patternElement)
        if r_Count > 1:
            #print("sentify.isNotDublicateString(): -> multiplePattern error:", r_Count, ",", patternElement)
            return True
    #satz = "Breath of the WildThe Legend of Zelda: Breath of the Wild auf der E3 angespielt."
    #['breath of th', 'e wildthe le', 'gend of zeld', 'a: breath of', ' the wild au', 'f der e3 ang', 'espielt.']
    t_List = [t_Text[i:i + n] for i in range(0, len(t_Text), n)]
    for t in t_List:
        t = t.strip()
        r_Count = t_Text.count(t)
        if r_Count > 1:
            #print("sentify.isNotDublicateString(): -> multiplePattern error:", r_Count, ",", t)
            return True
    return False
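# --- hedged example (not in the original script): demonstrates the duplicate
# --- check on the sample sentence from the comment above plus a clean sentence.
print(isDublicateString("Breath of the WildThe Legend of Zelda: Breath of the Wild auf der E3 angespielt."))  # True
print(isDublicateString("Ein ganz normaler Satz ohne Wiederholung."))  # False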
def isGoodCasing(text):
    """Heuristic check for broken casing, e.g. 'aA', '2P' or 'a2' transitions."""
    if not isinstance(text, str):
        return False
    strString = text  # str(text, 'utf-8')
    i = 0
    try:
        for c in strString:
            if c in [u'ß', u'ö', u'ä', u'ü', u'Ö', u'Ä', u'Ü']:
                pass
            elif c != " " and i + 1 < len(strString):
                if c.islower() and strString[i + 1].isupper():
                    # 'aA' error (otherwise too many false positives)
                    return False
                if c.isdigit() and (strString[i + 1].isupper() and strString[i + 1].isalpha()):
                    # '2P' error
                    return False
                elif c.isdigit() and strString[i + 1].isalpha():
                    # '2a' error -- disabled, too many false positives
                    # (careful: addresses in the output could be affected)
                    pass  # return False
                elif c.isdigit() and strString[i + 1].isdigit():
                    pass  # '11' is fine
                elif c.isalpha() and strString[i + 1].isdigit():
                    # 'a2' error
                    return False
                elif c.isdigit() and strString[i + 1] not in ['.', '!', '?', '"', '-', '\'', ':', ' ']:
                    # digit followed by an unexpected character
                    return False
            i = i + 1
    except Exception:
        pass
    return True


def strip_accents(text):
    return "".join(char for char in unicodedata.normalize('NFKD', text)
                   if unicodedata.category(char) != 'Mn')


def parse_html(page):
    """Clean HTML tags for webpages that aren't Gutenberg books."""
    try:
        # https://github.com/miso-belica/jusText/tree/dev/justext/stoplists
        parts = justext.justext(page, justext.get_stoplist('English'))
        #parts = justext.justext(page, justext.get_stoplist('German'))
        paragraphs = list()
        for part in parts:
            if not part.is_boilerplate:
                paragraphs.append(part.text)
        return str('\n\n'.join(paragraphs))
    except Exception:
        # jusText failed -- fall back to BeautifulSoup + bleach
        try:
            soup = BeautifulSoup(page, "html.parser")  # or "lxml"
            #comments = soup.findAll(text=lambda text: isinstance(text, Comment))  # remove comments
            #[comment.extract() for comment in comments]
            for script in soup.findAll(["script", "style", 'footer', 'head']):
                script.decompose()  # rip it out
            myText = soup.get_text()
            plaintextv1 = bleach.clean(myText, strip=True, strip_comments=True)
            plaintext = re.sub(r'<.*?>', '', plaintextv1)
            #plaintext = plaintext.replace('\n', ' ')
            #plaintext = textify.removeDomainsSimple(plaintext)
            return plaintext  # .strip()
        except (ValueError, KeyError, TypeError, Exception):
            #print("Unexpected error:", sys.exc_info()[0])
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            #print(exc_type, fname, exc_tb.tb_lineno)
            return str("")
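# --- hedged example (helper name and URL are illustrative, not from the
# --- original script): how parse_html() is typically combined with requests.
def fetch_and_clean(url):
    """Download a page and return its boilerplate-free text, '' on failure."""
    try:
        resp = requests.get(url, timeout=10, verify=False)
        return parse_html(resp.text)
    except requests.RequestException:
        return ""

#print(fetch_and_clean('https://hbr.org/')[:500])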
def encodeToASCII(text):
    encResults = text.encode('utf-8', "ignore")
    s_string = str(encResults.decode('ascii', "ignore"))
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToLatin1(text):
    #text = text.replace('ß', 'ss')
    encResults = text.encode('utf-8', "ignore")
    s_string = str(encResults.decode('latin-1', "ignore"))
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToUTF8Adv(text):
    encResults = text.encode('utf-8', "ignore")
    #return str(encResults.decode('latin-1', "ignore"))
    s_string = str(encResults.decode('utf-8', "ignore"))  # 'remove' is not a valid error handler
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToUTF8(text):
    return text.encode('utf-8', "ignore")


def transformText(text):
    # https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#stanfordcorenlplemmatization
    stops = set(stopwords.words("english"))
    # Convert text to lower case
    text = text.lower()
    # Remove non-ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Remove all the stopwords
    filtered_words = [word for word in text.split() if word not in stops]
    # Remove all tokens with fewer than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    # Preprocessed text after stop-word removal
    text = " ".join(filtered_words)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Strip all numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Stemming
    #return gensim.parsing.preprocessing.stem_text(text)
    # Lemmatize (gensim.utils.lemmatize requires gensim < 4.0 and the 'pattern' package)
    t = [wd.decode('utf-8').split('/')[0] for wd in gensim.utils.lemmatize(text)]
    return str(" ".join(t))


"""
MAIN
https://minimaxir.com/2019/09/howto-gpt2/
"""
for key in manager:
    print("Walking1: ", key)           # key
    print("Walking2: ", manager[key])  # value
    tmp_string = str("")
    mystorepath = storepathData + "/" + key
    if os.path.exists(mystorepath):
        folders = list()
        files = list()
        for entry in os.scandir(mystorepath):
            if entry.is_dir():
                folders.append(entry.path)
            elif entry.is_file():
                files.append(entry.path)
        for f in files:
            #if len(tmp_string) < 6500000:
            #print(f)
            with codecs.open(f, 'r', encoding='utf-8') as fr:
                data1 = fr.read().replace('\n', '').strip()
                #data1 = fr.readlines()
            # disabled cleanup pipeline:
            #data2 = beautifyCorpus(data1)
            #data2 = transformText(data1)
            #data3 = re.sub("^[A-Za-z0-9,.!?:;]", "", data2)
            #data3 = encodeToASCII(data3)
            #data3 = data3.replace('\n', '').strip().lower()
            with codecs.open(managerDB, 'a+', encoding='utf8') as fa:
                #fa.write("manager###" + tra + "\n")
                #fa.write(data3 + "\n")
                fa.write(data1 + "\n")

exit(1)
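# --- hedged sketch (not part of this script, which stops at exit(1) above):
# --- the Colab fine-tuning step mentioned in the notes and in the minimaxir
# --- howto is typically done with gpt-2-simple along these lines; the file
# --- name, step count and run name are illustrative assumptions.
#
# import gpt_2_simple as gpt2
# gpt2.download_gpt2(model_name="124M")
# gpt2.mount_gdrive()                                  # Colab only
# sess = gpt2.start_tf_sess()
# gpt2.finetune(sess,
#               dataset="manager_train.txt",
#               model_name="124M",
#               steps=1000,
#               run_name="manager")
# gpt2.copy_checkpoint_to_gdrive(run_name="manager")   # save results to Google Drive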