import glob
import re
import codecs
import sys
import os
import html2text	# pip3 install html2text

def find_between(s, first, last):
	"""Return the text between the first `first` and the next `last` in `s`.

	The search for `last` begins right after the leftmost occurrence of
	`first`. An empty string is returned when either marker cannot be found.
	"""
	marker_pos = s.find(first)
	if marker_pos == -1:
		return ""
	begin = marker_pos + len(first)
	stop = s.find(last, begin)
	if stop == -1:
		return ""
	return s[begin:stop]

def find_between_r(s, first, last):
	"""Return the text between the last `first` and the last `last` in `s`.

	The rightmost occurrence of `first` is located, then the rightmost
	occurrence of `last` that starts at or after the end of that marker.
	An empty string is returned when either marker cannot be found.
	"""
	marker_pos = s.rfind(first)
	if marker_pos == -1:
		return ""
	begin = marker_pos + len(first)
	stop = s.rfind(last, begin)
	if stop == -1:
		return ""
	return s[begin:stop]

def find_between2(s, start, end):
	"""Return the part of `s` after `start`, cut at `end`, using `str.split`.

	Only the segment between the first and second occurrence of `start` is
	considered. If `end` does not occur in that segment, the whole segment
	is returned. Raises IndexError when `start` is not present in `s`.
	"""
	segments = s.split(start)
	# Index 1 is the text following the first `start` marker.
	after_start = segments[1]
	head = after_start.split(end)
	return head[0]

# --- Input / output locations -------------------------------------------
# File that receives the labelled lines.
# Earlier alternative: "/home/seo-auto-scaler/version3/extractor/seo-trainee.txt"
final = "/home/seo-auto-scaler/data/data.txt"
# Directory tree of the crawled site that is walked.
# Earlier alternative (a single article directory):
# "/home/crawling_seoautoscaler/www.dreizweieins.ch/2020/03/16/brand-storytelling-warum-marken-eine-geschichte-erzählen-müssen/"
root_dir = "/home/crawling_seoautoscaler/www.seo-trainee.de/"

# Year strings '2017'..'2021'; not used by any active code visible in this file.
year = [str(y) for y in range(2017, 2022)]

# --- HTML to text converter ---------------------------------------------
h = html2text.HTML2Text()
h.ignore_links = True
# Walk the crawled site, extract the article body of every page, convert it to
# plain text and append each usable line to `final` with the label
# "__label__TRAINEE".
#
# Relies on names defined earlier in this file: `root_dir`, `final`, `h`
# (the html2text converter) and `find_between_r`.

# Path fragments that exclude a file from processing (substring match on the
# full path).
_SKIPPED_PATH_WORDS = ("feed", "page", "tag")
# A text line containing any of these substrings is dropped.
_SKIPPED_LINE_WORDS = ("http", "wp-content", "uploads", "image")
# Markers delimiting the article body inside the page source.
_ARTICLE_START = "amp-wp-article-content"
_ARTICLE_END = "amp-wp-article-footer"
# Deletes the characters * / > = # \ from every emitted line in one pass.
_STRIP_TABLE = str.maketrans("", "", "*/>=#\\")
# Converted articles at or below this length are ignored entirely.
_MIN_ARTICLE_LENGTH = 1500
# Lines at or below this length (measured before cleaning) are ignored.
_MIN_LINE_LENGTH = 15

for filename in glob.iglob(root_dir + '**/**', recursive=True):
	print("Working on: "+filename)
	if not os.path.isfile(filename):
		continue
	# NOTE: a filter on the entries of `year` was tried here and is disabled.
	if any(word in filename for word in _SKIPPED_PATH_WORDS):
		continue

	# Read the page; the handle is released as soon as the text is in memory.
	try:
		with codecs.open(filename, 'r', encoding='utf8') as src:
			html_content = src.read()
	except (OSError, UnicodeError) as err:
		# Best effort: an unreadable or non-UTF-8 file is reported and skipped.
		print("Skipping unreadable file "+filename+": "+str(err), file=sys.stderr)
		continue

	# Cut out the article body and turn it into plain text.
	try:
		article_html = find_between_r(html_content, _ARTICLE_START, _ARTICLE_END)
		article_text = h.handle(article_html)
	except Exception as err:
		# The converter is third-party code; keep the crawl going but report
		# the failure instead of hiding it.
		print("Skipping unconvertible file "+filename+": "+str(err), file=sys.stderr)
		continue
	article_text = article_text.replace("l?token=","").replace("=>","")

	if len(article_text) <= _MIN_ARTICLE_LENGTH:
		continue

	# The length check applies to the raw line, before characters are stripped.
	labelled = [
		"__label__TRAINEE "+line.translate(_STRIP_TABLE)+"\n"
		for line in article_text.split("\n")
		if len(line) > _MIN_LINE_LENGTH
		and not any(word in line for word in _SKIPPED_LINE_WORDS)
	]

	# Open the output once per article, and only when there is something to
	# write, so no empty output file is created.
	if labelled:
		with codecs.open(final, 'a+', encoding='utf8') as out:
			out.write("".join(labelled))