"""Extract article text from a crawled copy of www.ecommerce-vision.de.

Every regular file below ``root_dir`` is read as UTF-8 HTML, the part between
the ``articleBody`` and ``VN:F`` markers is converted to plain text with
html2text, and each sufficiently long text line is appended to ``final``
prefixed with ``__label__VISION``.

Usage, as noted by the original author:

root@v22020089423124746:/home/seo-auto-scaler/data# python3 extractor_dreizweieins.py && python3 extractor_ecommerce-vision.py && python3 extractor_seokratie.py && python3 extractor_seonative.py && python3 extractor_seotrainee.py
"""

import glob
import re
import codecs
import sys
import os

import html2text  # pip3 install html2text


def find_between(s: str, first: str, last: str) -> str:
    """Return the text between the first ``first`` and the next ``last``.

    Returns an empty string when either delimiter is missing.
    """
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        return ""


def find_between_r(s: str, first: str, last: str) -> str:
    """Return the text between the last ``first`` and the last ``last`` after it.

    Returns an empty string when either delimiter is missing.
    """
    try:
        start = s.rindex(first) + len(first)
        end = s.rindex(last, start)
        return s[start:end]
    except ValueError:
        return ""


def find_between2(s: str, start: str, end: str) -> str:
    """Return the text after the first ``start`` up to the following ``end``.

    If ``end`` does not occur, everything after ``start`` is returned.

    Raises:
        IndexError: if ``start`` does not occur in ``s``.
    """
    return (s.split(start))[1].split(end)[0]


h = html2text.HTML2Text()
h.ignore_links = True

# Only referenced by a year filter that is currently disabled.
year = ['2017', '2018', '2019', '2020', '2021']

# Output file; labelled lines are appended to it.
final = "/home/seo-auto-scaler/data/data.txt"
# Root of the crawled site; walked recursively.
root_dir = "/home/crawling_seoautoscaler/www.ecommerce-vision.de/"

# Line content that switches recording on.
# NOTE(review): in the file as received this literal is a bare line break.
# Lines produced by split("\n") never contain one, so recording can never
# start with this value. Presumably the original marker was an HTML tag that
# got lost -- restore it before running.
START_MARKER = "\n"
# Line content that switches recording off again.
STOP_MARKER = "ratingblock"

# Characters removed from every output line.
_STRIP_CHARS = str.maketrans("", "", "*\\>=#")

for filename in glob.iglob(root_dir + '**/**', recursive=True):
    print("Working on: " + filename)

    # Skip directories as well as feed, pagination and tag pages.
    if not os.path.isfile(filename):
        continue
    if "feed" in filename or "page" in filename or "tag" in filename:
        continue

    try:
        with codecs.open(filename, 'r', encoding='utf8') as f:
            html_content = f.read()
    except (OSError, UnicodeDecodeError) as exc:
        # Best effort: an unreadable file is reported and skipped.
        print("Skipping unreadable file %s: %s" % (filename, exc), file=sys.stderr)
        continue

    # Collect the lines between the start marker and the stop marker.
    # The original assigned to a misspelled name ("inRecodingMode") at the
    # stop marker, so recording was never switched off.
    in_recording_mode = False
    recorded = []
    for line in html_content.split("\n"):
        if START_MARKER in line:
            in_recording_mode = True
        elif STOP_MARKER in line:
            in_recording_mode = False
        if in_recording_mode:
            recorded.append(line)
    content = "".join(recorded)

    try:
        article_html = find_between2(content, "articleBody", "VN:F")
    except IndexError:
        # No "articleBody" marker in the recorded content: nothing to extract.
        continue

    try:
        text = h.handle(article_html)
    except Exception as exc:
        # Best effort: a page html2text cannot convert is reported and skipped.
        print("Skipping %s, html2text failed: %s" % (filename, exc), file=sys.stderr)
        continue

    text = text.replace("l?token=", "")
    text = text.replace("=>", "")

    # Short texts are ignored.
    if len(text) <= 1500:
        continue

    # Keep lines longer than 15 characters that contain no URL or upload path;
    # the length check applies before the characters are stripped.
    labelled = [
        "__label__VISION " + line.translate(_STRIP_CHARS) + "\n"
        for line in text.split("\n")
        if len(line) > 15
        and "http" not in line
        and "wp-content" not in line
        and "uploads" not in line
    ]

    # Open the output once per document, and only when there is something to write.
    if labelled:
        with codecs.open(final, 'a+', encoding='utf8') as f:
            f.writelines(labelled)