# Read in some helpful libraries
import nltk                         # the Natural Language Toolkit, open-source NLP
import pandas as pd                 # pandas dataframes
import re                           # regular expressions
from nltk.corpus import stopwords
import gensim                       # helps in preprocessing the data, very efficiently
from gensim import parsing
from gensim.utils import lemmatize  # NOTE: needs the `pattern` package; removed in gensim 4.x
import numpy as np
import pprint
import lxml
import codecs
import os
import sys
import gzip
import json

# Sample random elements from a file:
#   perl -ne 'print if (rand() < .1)' manager_train.txt > manager_training.txt
# random sampling -> https://unix.stackexchange.com/questions/108581/how-to-randomly-sample-a-subset-of-a-file
#
# cat teenink/* teenreads/* girlscouts/* > kids.txt
# cat alphamom/* babble/* > parents.txt
# cat bloomberg/* ceo/* chiefexecutive/* cio/* forbes/* foreignaffairs/* hbr/* inc/* managementexchange/* n2growth/* real-leaders/* strategy-business/* > manager.txt
# cat bloomberg/1* >> manager.txt
#
# https://www.kaggle.com/arunava21/word2vec-and-random-forest-classification

storepathData = "/home/unaiqueFRAMEWORK/data"
storepath = "/home/unaiqueFRAMEWORK/text_classify/training_data"
managerDB = storepath + "/manager_train.txt"
parentsDB = storepath + "/parents_train.txt"
teenagerDB = storepath + "/teenager_train.txt"

# Source folders (keys) mapped to the domains they were crawled from (values).
manager = {"bloomberg": 'bloomberg.com', "cio": 'cio.com', "hbr": 'hbr.org',
           "forbes": 'forbes.com', 'foreignaffairs': 'foreignaffairs.com',
           'ceo': 'ceo.com', 'chiefexecutive': 'chiefexecutive.net',
           'economist': 'economist.com', 'strategy-business': 'strategy-business.com',
           'managementexchange': 'managementexchange.com',
           'real-leaders': 'real-leaders.com', 'inc': 'inc.com',
           'n2growth': 'n2growth.com', "wsj": 'wsj.com'}
parents = {"babble": 'babble.com', "alphamom": 'alphamom.com'}
teenager = {"teenink": 'teenink.com', 'teenreads': 'teenreads.com',
            'girlscouts': 'girlscouts.org', 'teenkidsnews': 'teenkidsnews.com',
            'teensource': 'teensource.org', 'teenvogue': 'teenvogue.com',
            'teenspeak': 'teenspeak.org', 'teensgotcents': 'teensgotcents.com',
            'theteenagertoday': 'theteenagertoday.com'}


def encodeToLatin1(text):
    #text = text.replace('ß','ss')
    encResults = text.encode('utf-8', "ignore")
    s_string = str(encResults.decode('latin-1', "ignore"))
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToUTF8Adv(text):
    encResults = text.encode('utf-8', "ignore")
    #return str(encResults.decode('latin-1', "ignore"))
    s_string = str(encResults.decode('utf-8', "ignore"))  # "remove" is not a valid error handler
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToUTF8(text):
    return text.encode('utf-8', "ignore")


def transformText(text):
    # https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#stanfordcorenlplemmatization
    stops = set(stopwords.words("english"))
    # Convert text to lower case
    text = text.lower()
    # Remove non-ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Remove all the stopwords
    filtered_words = [word for word in text.split() if word not in stops]
    # Remove all tokens with fewer than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    # Preprocessed text after stop-word removal
    text = " ".join(filtered_words)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    # Strip multiple whitespaces again
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Stemming (unused alternative):
    #return gensim.parsing.preprocessing.stem_text(text)
    # Lemmatize; gensim.utils.lemmatize returns tokens like b'word/NN',
    # so keep only the word part before the POS tag
    t = [wd.decode('utf-8').split('/')[0] for wd in gensim.utils.lemmatize(text)]
    return str(" ".join(t))
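
# Hypothetical sanity check for transformText(), illustrative only and not part
# of the original pipeline. It assumes nltk.download('stopwords') has been run
# and that gensim < 4.0 plus the `pattern` package are installed, since
# gensim.utils.lemmatize() was removed in gensim 4.x:
#
#   transformText("The CEOs were meeting investors, discussing strategy!")
#   # -> roughly "ceo meet investor discuss strategy"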

"""
MAIN
"""

def buildTrainingFile(sources, label, outPath):
    # For one audience: walk every crawled source folder, clean each file,
    # and append it to the training file as a "<label>###<text>" line.
    for key in sources:
        mystorepath = storepathData + "/" + key
        if not os.path.exists(mystorepath):
            continue
        files = [entry.path for entry in os.scandir(mystorepath) if entry.is_file()]
        for f in files:
            with codecs.open(f, 'r') as fr:
                # join lines with a space so words at line breaks don't stick together
                data = fr.read().replace('\n', ' ').strip()
            tra = transformText(data)
            with codecs.open(outPath, 'a+') as fa:
                fa.write(label + "###" + tra + "\n")

buildTrainingFile(manager, "manager", managerDB)
buildTrainingFile(teenager, "teenager", teenagerDB)
buildTrainingFile(parents, "parents", parentsDB)
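
# Illustrative follow-up, an assumption rather than part of the original script:
# the training files written above hold "<label>###<text>" lines, so they can be
# loaded into a pandas DataFrame for the classification step referenced in the
# Kaggle link. The multi-character "###" separator requires the python engine.
if os.path.exists(managerDB):
    df = pd.read_csv(managerDB, sep="###", engine="python",
                     names=["label", "text"], header=None)
    print(df.shape)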