# read in some helpful libraries
import nltk                      # the natural language toolkit, open-source NLP
import pandas as pd              # pandas dataframes
import re                        # regular expressions
from nltk.corpus import stopwords
from gensim.utils import lemmatize
from gensim import parsing       # helps to preprocess the data very efficiently
import gensim
import numpy as np
import pprint
import lxml
import codecs
import os, sys
import gzip
import json
import unicodedata
from multiprocessing.pool import ThreadPool
from multiprocessing import Pool
import concurrent.futures
#import cdx_toolkit
from pathlib import Path
import random
import math
import uuid
import warcio                    # pip3 install --upgrade warcio
from warcio.archiveiterator import ArchiveIterator
import builtins
from io import BytesIO, StringIO
pp = pprint.PrettyPrinter(indent=4)
import justext                   # pip install justext
#import urllib.request
import requests                  # pip3 install --upgrade requests
#from urllib.parse import urlparse  # pip3 install --upgrade urlparse
#import urllib  # url = urllib.unquote(url).decode('utf8')
import urllib3
urllib3.disable_warnings()
import signal
from contextlib import contextmanager
from bs4 import BeautifulSoup, Comment
import bleach                    # https://pypi.org/project/bleach/
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from tabulate import tabulate
import logging
import datetime
import dill                      # important for joblib load and dump
import io
from langdetect import detect    # also https://github.com/saffsd/langid.py
import langid
from time import time as timer
import multiprocessing as mpl
#from weasyprint import HTML  # https://weasyprint.readthedocs.io/en/stable/tutorial.html
import glob
import webvtt                    # pip3 install -U webvtt-py
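
# The stop word list used in transformText() below comes from the NLTK data
# distribution and has to be fetched once per machine. This guarded download is
# only a small convenience sketch (assumption: network access is available).
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
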
def transformText(text):
    # https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#stanfordcorenlplemmatization
    stops = set(stopwords.words("english"))
    # Convert text to lower case
    text = text.lower()
    # Remove non-ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Remove all stopwords
    filtered_words = [word for word in text.split() if word not in stops]
    # Remove all tokens with fewer than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    # Preprocessed text after stop word removal
    text = " ".join(filtered_words)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Strip all numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    # Strip multiple whitespaces again
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Stemming (disabled in favour of lemmatization)
    #return gensim.parsing.preprocessing.stem_text(text)
    # Lemmatize; gensim.utils.lemmatize returns byte strings such as b'word/NN'
    t = [wd.decode('utf-8').split('/')[0] for wd in gensim.utils.lemmatize(text)]
    return str(" ".join(t))


def encodeToASCII(text):
    # Drop every character that cannot be represented as ASCII
    encResults = text.encode('utf-8', "ignore")
    s_string = str(encResults.decode('ascii', "ignore"))
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


#fn = symbol+"-"+"10k-q4-2019-it.htm"
#fn_pdf = symbol+"-"+"10k-q4-2019-it.pdf"
myResultFile = "/home/unaiqueFRAMEWORK/new_prototyp/training/data/teenager_train.txt"
myDownloadDirL = "/home/unaiqueFRAMEWORK/research/youtube/*.vtt"
myDir = glob.glob(myDownloadDirL)
# https://stackoverflow.com/questions/12221387/how-to-extract-the-first-x-megabyte-from-a-large-file-in-unix-linux
for f in myDir:
    #print(f)
    #e = f.split("/")
    filename = os.path.basename(f)
    #print(filename)
    capList = list()
    capAllList = list()
    capString = str("")
    # Collect every caption of the current .vtt file
    for caption in webvtt.read(f):
        #print(caption.start)
        #print(caption.end)
        #print(caption.text)
        capString = capString + ", " + caption.text.strip()
        c = caption.text.strip()
        capList.append(c)
    capAllList.append(" ".join(capList))
    #capString = transformText(capString)
    #capString = re.sub("^[A-Za-z0-9,.!?:;]", "", capString)
    #capString = encodeToASCII(capString)
    #capString = capString.replace('\n', '').strip().lower()
    # Append the joined captions of this file to the training data file
    with codecs.open(myResultFile, 'a+', encoding='utf8') as fa:
        #fa.write("teenager###"+tra+"\n")
        #fa.write(capString+"\n")
        ca = str("\n".join(capAllList))
        fa.write(ca + "\n")  # trailing newline keeps captions of different files on separate lines
    """
    with open(myResultFile, 'a+', encoding='utf-8') as fp:
        fp.write(capString+"\n")
        fp.close()
    """
    capString = str("")
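
# Quick, purely illustrative sanity check of the two helpers above; the sample
# sentence is made up. Note that transformText() relies on gensim.utils.lemmatize,
# which needs gensim < 4.0 together with the `pattern` package installed.
sampleCaption = "Teenagers are watching three new videos every single day!"
print(encodeToASCII(sampleCaption))
print(transformText(sampleCaption))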