"""
Copyright (c) 2023, Sebastian Enger, M.Sc.
All rights reserved.
This source code is licensed under the BSD-style license found in the
LICENSE file in the root directory of this source tree (BSD-4-Clause).
Frontend and Backend Source Code for Project:
- https://www.artikelschreiber.com/
- https://www.artikelschreiben.com/
- https://www.unaique.net/
"""
import logging
logging.getLogger().disabled = True
logging.disable(logging.WARNING) # suppresses WARNING and everything below it (INFO, DEBUG)
import urllib.parse
import sys
sys.path.append('/home/unaique/library3')
#sys.path.append('/home/unaique/library')
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
"""
Fix SSL Problems:
# https://gankrin.org/how-to-fix-ssl-certificate_verify_failed-error-error-in-python/
$ sudo update-ca-certificates --fresh
$ export SSL_CERT_DIR=/etc/ssl/certs
"""
# pip3 install -U aiohttp asyncio
# https://www.twilio.com/blog/-asynchrone-http-anforderungen-python-aiohttp-asyncio
#import aiohttp
#import asyncio
import requests # pip3 install --upgrade requests
import bleach #https://pypi.org/project/bleach/
from dns import resolver,reversename
import json
import codecs
import string
from string import printable
from unidecode import unidecode
import re
import random
import os
import socket
from bs4 import BeautifulSoup, Comment
import extruct # https://github.com/scrapinghub/extruct /// pip install -U extruct
from w3lib.html import get_base_url # https://github.com/scrapy/w3lib /// pip install -U w3lib
import regex as re1 # pip3 install -U regex
from urllib.parse import urljoin
from urllib.parse import urlencode
from urllib.parse import urlparse
import urllib3
urllib3.disable_warnings()
from newspaper import Article
#import newspaper #pip3 install -U newspaper3k
article = Article('')
# compile regexes, so we don't do this on the fly and rely on caching
#from typing import Any, Dict, Pattern
from typing import Pattern
from langdetect import detect # also https://github.com/saffsd/langid.py - pip3 install -U langdetect
import langid # pip3 install -U langid
from langua import Predict # pip3 install -U langua https://github.com/whiletruelearn/langua
import pycld2 as cld2 # pip install -U pycld2
#from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser # pip3 install -U sumy
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.summarizers.reduction import ReductionSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from html import unescape
import geoip2.database
geoReader = geoip2.database.Reader("/home/unaique/library3/geoip/GeoLite2-City.mmdb")
# https://github.com/P3TERX/GeoLite.mmdb - Downloads
#https://geoip2.readthedocs.io/en/latest/
# pip3 install -U geoip2
# https://download.maxmind.com/app/geoip_download?edition_id=GeoLite2-City&license_key=############&suffix=tar.gz
# Account/User ID ###########
# License key ################
# pip3 install -U IP2Location
#https://www.ip2location.com/development-libraries/ip2location/python
#https://github.com/chrislim2888/IP2Location-Python
# Web: https://lite.ip2location.com/ - User: ############ - Pass: ############
# Direct Download: https://www.ip2location.com/download/?token=##############&file=DB11LITEBIN
# Free Download: Sources: https://download.ip2location.com/lite/
import IP2Location
database = IP2Location.IP2Location("/home/unaique/library3/geoip/IP2LOCATION-LITE-DB11.BIN")
import unicodedata # stdlib; the third-party unidecode package is imported above
import psutil # pip3 install -U psutil https://www.geeksforgeeks.org/how-to-get-current-cpu-and-ram-usage-in-python/
import datetime as dt
from html5validate import validate
#pip3 install -U html5lib
from lxml import etree
from io import StringIO
# https://hackersandslackers.com/scrape-metadata-json-ld/
# https://hackersandslackers.com/scraping-urls-with-beautifulsoup/
# https://github.com/hackersandslackers/beautifulsoup-tutorial
# https://github.com/practical-data-science/ecommercetools
# https://github.com/mozilla/bleach
# next and previous tags: https://stackoverflow.com/questions/53437616/beautifulsoup-find-elements-directly-below-and-above-heading-with-specific-str
"""
from AIlify import AI
myAI = AI()
#print(type(myAI))
#sys.exit(1)
"""
"""
# Start: global variables for Rankify
headline_min_length = 7 # chars
min_word_count = 7 # words
min_text_length = 140 # chars
max_spaces_count = 5 # ' '
headlines_list = ["h1","h2","h3","h4","h5","h6"]
# End: global variables for Rankify
"""
VALUESERP_KEY = '########################'
punct = set(string.punctuation)
UserAgent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
UserAgentMobile = "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_2 like Mac OS X) AppleWebKit/603.2.4 (KHTML, like Gecko) FxiOS/7.5b3349 Mobile/14F89 Safari/603.2.4"
#UserAgentMobile = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
Headers = {'user-agent': UserAgent, 'Content-Type': 'text/html; charset=utf-8', 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate'}
#HeadersSimple = {'user-agent': UserAgentMobile, 'Content-Type': 'text/html; charset=utf-8', 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate'}
HeadersSimple = {'user-agent': UserAgentMobile, 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate'}
HeadersSimpleADV = {'user-agent': UserAgentMobile, 'Authorization': '################################', 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate'}
stoplist_domains = "/home/unaique/library3/blacklists/adblock.txt" # Source: https://github.com/carlospolop/MalwareWorld and https://malwareworld.com/textlists/suspiciousDomains.txt
stoplist_shops = "/home/unaique/library3/blacklists/shops.txt"
stoplist_paywall = "/home/unaique/library3/blacklists/paywall.txt"
stoplist_porn = "/home/unaique/library3/blacklists/pornstopwordlist.txt"
stoplist_captcha = "/home/unaique/library3/blacklists/captcha.txt"
f_domains = open(stoplist_domains, "r", encoding='utf-8')
f_shops = open(stoplist_shops, "r", encoding='utf-8')
f_paywall = open(stoplist_paywall, "r", encoding='utf-8')
f_porn = open(stoplist_porn, "r", encoding='utf-8')
f_captcha = open(stoplist_captcha, "r", encoding='utf-8')
"""
stoplist_domains_blocker = list(set(f_domains.readlines()))
stoplist_shops_blocker = list(set(f_shops.readlines()))
stoplist_paywall_blocker = list(set(f_paywall.readlines()))
stoplist_porn_blocker = list(set(f_porn.readlines()))
stoplist_captcha_blocker = list(set(f_captcha.readlines()))
"""
RE_LINEBREAK: Pattern = re.compile(r"(\r\n|[\n\v])+")
RE_NONBREAKING_SPACE: Pattern = re.compile(r"[^\S\n\v]+")
RE_ZWSP: Pattern = re.compile(r"[\u200B\u2060\uFEFF]+")
RE_BRACKETS_CURLY = re.compile(r"\{[^{}]*?\}")
RE_BRACKETS_ROUND = re.compile(r"\([^()]*?\)")
RE_BRACKETS_SQUARE = re.compile(r"\[[^\[\]]*?\]")
RE_BULLET_POINTS = re.compile(
# require bullet points as first non-whitespace char on a new line, like a list
r"((^|\n)\s*?)"
r"([\u2022\u2023\u2043\u204C\u204D\u2219\u25aa\u25CF\u25E6\u29BE\u29BF\u30fb])",
)
# source: https://gist.github.com/dperini/729294
RE_URL: Pattern = re.compile(
r"(?:^|(?= 224.0.0.0
# excludes network & broadcast addresses
# (first & last IP address of each class)
r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
r"|"
# host name
r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
# domain name
r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
# TLD identifier
r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
r")"
# port number
r"(?::\d{2,5})?"
# resource path
r"(?:/\S*)?"
r"(?:$|(?![\w?!+&/]))",
flags=re.IGNORECASE,
)
RE_SHORT_URL: Pattern = re.compile(
r"(?:^|(?<![\w/.]))"
# optional scheme
r"(?:(?:https?://)?)"
# domain
r"(?:\w-?)*?\w+(?:\.[a-z]{2,12}){1,3}"
r"/"
# hash
r"[^\s.,?!'\"|+]{2,12}"
r"(?:$|(?![\w?!+&/]))",
flags=re.IGNORECASE,
)
# NOTE: the remaining module-level definitions (RE_EMAIL, RE_EMOJI, RE_HASHTAG,
# RE_NUMBER, RE_PHONE_NUMBER, RE_USER_HANDLE, RE_CURRENCY_SYMBOL, the
# low_quality_content list), the class declaration with its __init__ (which set
# self.max_load_fastmode, self.headline_min_length, self.min_word_count,
# self.min_text_length, self.max_spaces_count and the stoplist_*_blocker
# attributes) were lost in this excerpt; the methods below belong to that class
# (referred to as "htmlify"/"textify" in comments further down).
def countSpecialChars(self, text): # method head reconstructed; the name and counting logic are assumptions
special_char = sum(1 for c in str(text) if not c.isalnum() and not c.isspace())
"""
if special_char >= 1:
print("String contains {} Special Character/s ".format(special_char))
else:
print("There are no Special Characters in this String.")
"""
return special_char
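# split_sentences() below relies on module-level regex fragments that were part
# of the lost definitions above. These are the canonical values from the widely
# used sentence-splitting snippet that method follows (assumed, not recovered
# from this file):
#
#   alphabets = "([A-Za-z])"
#   prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
#   suffixes = "(Inc|Ltd|Jr|Sr|Co)"
#   starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
#   acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
#   websites = "[.](com|net|org|io|gov)"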
def doSerpSearch(self, MainKeyword, SubKeywords, language, ip_obj):
search_query = str(MainKeyword+", "+SubKeywords)
isYoutubeFlag = False
gl_google_country = {
# https://www.valueserp.com/docs/search-api/reference/google-countries
# ascom internal language shortcode: google country equivalent
"de": "de",
"en": "us",
"es": "es",
"fr": "fr",
"it": "it",
"ru": "ru",
"cn": "cn",
"zh": "cn",
"jp": "jp",
"ja": "jp",
"pt": "pt",
"in": "in",
"hi": "in",
"sa": "sa",
"ar": "sa",
"tr": "tr",
}
gl_google_language = {
# https://www.valueserp.com/docs/search-api/reference/google-countries
# ascom internal language shortcode: google country equivalent
"de": "lang_de",
"en": "lang_en",
"es": "lang_es",
"fr": "lang_fr",
"it": "lang_it",
"ru": "lang_ru",
"cn": "lang_zh-tw",
"zh": "lang_zh-tw",
"jp": "lang_ja",
"ja": "lang_ja",
"pt": "lang_pt",
"in": "lang_en",
"hi": "lang_en",
"sa": "lang_ar",
"ar": "lang_ar",
"tr": "lang_tr",
}
"""
The gl parameter determines the Google country to use for the query. View the full list of supported gl values at https://www.valueserp.com/docs/search-api/reference/google-countries. Defaults to us.
The hl parameter determines the Google UI language of the returned results. View the full list of supported hl values at https://www.valueserp.com/docs/search-api/reference/google-languages. Defaults to en.
"""
params = {
'api_key': VALUESERP_KEY,
'flatten_results': True,
#'include_answer_box': True, # flatten_results and include_answer_box do not work together
#'include_advertiser_info': True, # this costs 1 extra credit point, i.e. 2 instead of 1 when the option is left out
'q': search_query,
'gl': gl_google_country.get(language, "us"), # .get() with a default guards against unmapped language codes
'hl': 'de', # note: the UI language is hard-coded to German here
'num': '75', #'100',
'location': "lat:"+ip_obj.get("ip_latitude")+",lon:"+ip_obj.get("ip_longitude"),
'safe': 'active',# or enable safe search: 'active', disable safe mode: "off"
'lr': gl_google_language.get(language, "lang_en") # https://www.valueserp.com/docs/search-api/reference/google-lr-languages
}
if isYoutubeFlag:
params['search_type'] = 'videos'
myteler = dict()
youtube_videos = list()
related_searches = list()
related_questions = list()
organic_results = list()
answer_box = list()
isSuccess = False
api_result_content = str("")
try:
api_result = requests.get('https://api.valueserp.com/search', params, timeout=300)
if api_result.status_code == 200:
api_result_content = api_result.json()
isSuccess = api_result_content.get("request_info", {}).get("success", False)
except Exception as as1:
pass
"""
r = codecs.open("/home/unaique/library3/ARCHIV/valueserp.json", 'r', encoding='utf-8')
api_result_content = json.loads(r.read())
isSuccess = True
r.close()
"""
myteler["serp"] = api_result_content # ist ein dict, muss mit print(json.dumps(myteler["serp"], indent=2)) zu json umgewandelt werden
if not isSuccess:
myteler["isSuccess"] = False
myteler["serp_youtube_videos"] = list()
myteler["serp_related_searches"] = list()
myteler["serp_faq_related_questions"] = list()
myteler["serp_organic_results"] = list()
myteler["serp_mainkeyword"] = str(MainKeyword)
myteler["serp_subkeywords"] = str(SubKeywords)
return myteler
elif isSuccess: # All options here: https://www.valueserp.com/docs/search-api/results/google/search
myteler["isSuccess"] = True
if isYoutubeFlag:
inline_videos_objs = api_result_content.get("video_results","")
for iv_obj in inline_videos_objs:
yt = dict()
yt_title = str(iv_obj.get("title",""))
yt_link = str(iv_obj.get("link",""))
#yt_date = str(iv_obj.get("date",""))
yt_date_utc = str(iv_obj.get("date_utc",""))
yt_snippet = str(iv_obj.get("snippet",""))
yt_image = str(iv_obj.get("image",""))
yt_length = str(iv_obj.get("length",""))
if len(yt_title) > 3 and len(yt_link) > 3 and yt_link.find("https://www.youtube.com/") != -1:
yt["youtube_video"] = yt_link
yt["youtube_title"] = yt_title
yt["youtube_date_utc"] = yt_date_utc
yt["youtube_snippet"] = yt_snippet
yt["youtube_length"] = yt_length
yt["youtube_image"] = yt_image
youtube_videos.append(yt)
return youtube_videos
else:
related_searches_objs = api_result_content.get("related_searches","")
for rs_obj in related_searches_objs:
rs = dict()
rs_title = str(rs_obj.get("query",""))
if len(rs_title) > 3:
rs["related_searches"] = rs_title
related_searches.append(rs)
related_questions_objs = api_result_content.get("related_questions","")
for rq_obj in related_questions_objs:
rq = dict()
rq_source_link = str("")
rq_source_title = str("")
try:
rq_source_link = str(rq_obj.get("source", {}).get("link",""))
rq_source_title = str(rq_obj.get("source", {}).get("title",""))
except Exception as as1:
pass
rq_question = str(rq_obj.get("question",""))
rq_answer = str(rq_obj.get("answer",""))
if len(rq_question) > 3 and len(rq_answer) > 3 and len(rq_source_title) > 3 and len(rq_source_link) > 3:
rq["faq_question"] = rq_question
rq["faq_answer"] = rq_answer
rq["faq_source_link"] = rq_source_link
rq["faq_source_title"] = rq_source_title
related_questions.append(rq)
organic_results_objs = api_result_content.get("organic_results","")
for or_obj in organic_results_objs:
or_d = dict()
or_position = str(or_obj.get("position",""))
or_title = str(or_obj.get("title",""))
or_link = str(or_obj.get("link",""))
or_snippet = str(or_obj.get("snippet",""))
or_cached_page_link = str(or_obj.get("cached_page_link","")) # https://webcache.googleusercontent.com/search?q=cache:or_4V5fHFHkJ:https://www.gruender.de/vertrieb/amazon-seo/&cd=12&hl=de&ct=clnk&gl=de&lr=lang_de&vwsrc=1
or_about = str("")
try:
or_about = str(or_obj.get("about_this_result", {}).get("your_search_and_this_result",""))
except Exception as as1:
pass
or_date = str(or_obj.get("date",""))
or_date_utc = str(or_obj.get("date_utc",""))
or_rich_snippet_rating = str("")
or_rich_snippet_reviews = str("")
try:
or_rich_snippet_rating = str(or_obj.get("rich_snippet", {}).get("top", {}).get("detected_extensions", {}).get("rating",int(0)))
or_rich_snippet_reviews = str(or_obj.get("rich_snippet", {}).get("top", {}).get("detected_extensions", {}).get("reviews",int(0)))
except Exception as as1:
pass
if (len(or_position) >= 1 and len(or_title) > 3 and len(or_link) > 3 and len(or_snippet) > 3) or (len(or_title) > 3 and len(or_link) > 3 and (or_link.find("youtube.com/") != -1 or or_link.find("youtu.be/") != -1 or or_link.find("http://") != -1 or or_link.find("https://") != -1)):
isUrlBlacklisted = self.isDomainBlacklisted(or_link)
if not isUrlBlacklisted:
or_d["organic_results_position"] = or_position
or_d["organic_results_title"] = or_title
or_d["organic_results_link"] = or_link
or_d["organic_results_snippet"] = or_snippet
or_d["organic_results_cached_page"] = or_cached_page_link
or_d["organic_results_about_list"] = or_about
or_d["organic_results_date"] = or_date
or_d["organic_results_date_utc"] = or_date_utc
or_d["organic_results_rich_snippet_rating"] = or_rich_snippet_rating
or_d["organic_results_rich_snippet_reviews"] = or_rich_snippet_reviews
organic_results.append(or_d)
inline_videos_objs = api_result_content.get("inline_videos","")
for iv_obj in inline_videos_objs:
yt = dict()
yt_title = str(iv_obj.get("title",""))
yt_link = str(iv_obj.get("link",""))
yt_date_utc = str(iv_obj.get("date_utc",""))
yt_snippet = str(iv_obj.get("snippet",""))
yt_image = str(iv_obj.get("image",""))
yt_length = str(iv_obj.get("length",""))
# Source: https://gist.github.com/rodrigoborgesdeoliveira/987683cfbfcc8d800192da1e73adc486
if len(yt_title) > 3 and len(yt_link) > 3 and ((yt_link.find("youtube.com/") != -1 and yt_link.find("watch?v=") != -1) or yt_link.find("youtu.be") != -1 or yt_link.find("youtube.com/embed/") != -1 or yt_link.find("youtube-nocookie.com/embed/") != -1):
yt["youtube_video"] = yt_link
yt["youtube_title"] = yt_title
yt["youtube_date_utc"] = yt_date_utc
yt["youtube_snippet"] = yt_snippet
yt["youtube_length"] = yt_length
yt["youtube_image"] = yt_image
youtube_videos.append(yt)
if len(youtube_videos) < 1:
for my_obj in organic_results:
yt = dict()
yt_link = str(my_obj.get("organic_results_link"))
my_content_title = str(my_obj.get("organic_results_title"))
if len(yt_link) > 3 and ((yt_link.find("youtube.com/") != -1 and yt_link.find("watch?v=") != -1) or yt_link.find("youtu.be") != -1 or yt_link.find("youtube.com/embed/") != -1 or yt_link.find("youtube-nocookie.com/embed/") != -1):
yt["youtube_video"] = yt_link
yt["youtube_title"] = my_content_title
yt["youtube_date_utc"] = str("")
yt["youtube_snippet"] = str("")
yt["youtube_length"] = str("")
yt["youtube_image"] = str("")
youtube_videos.append(yt)
myteler["serp_youtube_videos"] = youtube_videos
myteler["serp_related_searches"] = related_searches
myteler["serp_faq_related_questions"] = related_questions
myteler["serp_organic_results"] = organic_results
myteler["serp_mainkeyword"] = str(MainKeyword)
myteler["serp_subkeywords"] = str(SubKeywords)
return myteler
def is_url(self, url):
# note: this only checks that urlparse() finds a non-empty path component
try:
result = urlparse(url)
return all([result.path])
except ValueError:
return False
def validateHTML(self, html):
"""
import sys
# https://github.com/danthedeckie/html5validate
sys.path.append('/home/unaique/library')
from html5validate import validate
#pip3 install -U html5lib
from lxml import etree
from io import StringIO
"""
try:
validate(html)
return True
except Exception as e1:
try:
parser = etree.HTMLParser(recover=False)
etree.parse(StringIO(html), parser)
return True
except Exception as e1:
return False
return False
# End: global variables for Rankify
def countBadChars(self, text):
if len(text) < 1:
return 0
text = str(text)
#return sum(not c.isalnum() and not c.isspace() for c in text)
return sum(c in string.punctuation for c in text)
def useFastmode(self, text):
# the load check must (!) come early, since it is essential for the server
# no fast mode during the development phase
isPorn = self.isPornBlacklisted(text)
if isPorn:
return True
load1, load5, load15 = psutil.getloadavg()
if load1 >= self.max_load_fastmode:
return True
hourOfDay = int(dt.datetime.now().hour)
if hourOfDay >= 8 and hourOfDay <= 16: # peak hours from 8 am to 4 pm, with a lower probability of using the AI module
random_number = random.randint(0, 100)
else:
random_number = random.randint(0, 75)
if random_number == 1: # the AI module may only be used when the random value is exactly 1; any other value keeps fast mode on
return False # no else branch here, so the load can still be considered
#return False # no fast mode, i.e. the AI module may be used
return True
def remove_accents(self, input_str):
nfkd_form = unicodedata.normalize('NFKD', input_str)
only_ascii = nfkd_form.encode('ASCII', 'ignore')
s_string = str(only_ascii.decode('ASCII', "ignore"))
return str(s_string)
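# Usage example (illustrative):
#   remove_accents("Überschrift für José") -> "Uberschrift fur Jose"
# NFKD decomposition separates the accents, which the ASCII round-trip then drops.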
def isIntent(self, text):
informative_intent = ['what','who','when','where','which','why','how']
transactional_intent = ['buy','order','purchase','cheap','price','discount','shop','sale','offer']
commercial_intent = ['best','top','review','comparison','compare','vs','versus','guide','ultimate']
custom_intent = ['google','facebook','youtube','twitter','instagram','ebay','microsoft','apple','amazon','whatsapp'] # https://majestic.com/reports/majestic-million
for element in informative_intent + transactional_intent + commercial_intent + custom_intent:
if self.isFound(text, element): # assumption: the undefined hasMatch() helper maps to self.isFound()
return True
return False
def getGeoInformationForLogging(self, ip):
if len(ip) < 7:
ip = "1.1.1.1"
try:
response = geoReader.city(ip)
rec = database.get_all(ip)
iso = str(response.country.iso_code)
cn = str(response.country.name)
ctn = str(response.city.name)
pc = str(response.postal.code)
lat = str(response.location.latitude)
lon = str(response.location.longitude)
region = str(response.subdivisions.most_specific.name)
#return (iso+","+cn+","+ctn+","+pc+","+lat+","+lon)
#return str(rec.country_short+","+rec.country_long+","+rec.region+","+rec.city+","+rec.zipcode+","+rec.latitude+","+rec.longitude+","+ip)
cn1 = str()
region1 = str()
cn1 = rec.country_long
region1 = response.city.name
if cn1 is None:
cn1 = str(response.country.name)
else:
cn1 = str(rec.country_long)
if region1 is None:
region1 = str(rec.region)
else:
region1 = str(response.city.name)
return str(cn1+" -> "+region1)
except Exception as a1:
pass
return str("iso"+","+"cn"+","+"region"+","+"ctn"+","+"pc"+","+"lat"+","+"lon"+","+"ip")
def getGeoInformation(self, ip):
if len(ip) < 7:
ip = "1.1.1.1"
try:
response = geoReader.city(ip)
iso = str(response.country.iso_code)
cn = str(response.country.name)
ctn = str(response.city.name)
pc = str(response.postal.code)
lat = str(response.location.latitude)
lon = str(response.location.longitude)
region = str(response.subdivisions.most_specific.name)
#return (iso+","+cn+","+ctn+","+pc+","+lat+","+lon)
return str(iso+","+cn+","+region+","+ctn+","+pc+","+lat+","+lon+","+ip)
except Exception as a1:
pass
return str("iso"+","+"cn"+","+"region"+","+"ctn"+","+"pc"+","+"lat"+","+"lon"+","+"ip")
def getGeoInformation2(self, ip):
if len(ip) < 7:
ip = "1.1.1.1"
try:
rec = database.get_all(ip)
return str(rec.country_short+","+rec.country_long+","+rec.region+","+rec.city+","+rec.zipcode+","+rec.latitude+","+rec.longitude+","+ip)
except Exception as as1:
pass
return str("iso"+","+"cn"+","+"region"+","+"ctn"+","+"pc"+","+"lat"+","+"lon"+","+"ip")
def getGeoInformationAdvanced(self, ip):
ip_obj = dict()
iso = str("")
cn = str("")
ctn = str("")
pc = str("")
lat = str("48.364727")
lon = str("10.900789")
region = str("")
iso2 = str("")
cn2 = str("")
ctn2 = str("")
pc2 = str("")
lat2 = str("")
lon2 = str("")
region2 = str("")
my_hostname = str("")
try:
my_hostname = str(socket.gethostbyaddr(ip)[0])
except Exception as a1:
pass
if len(my_hostname) < 1:
try:
my_hostname = str(socket.getnameinfo((ip, 0), 0)[0])
except Exception as a1:
pass
if len(my_hostname) < 1:
try:
addr = reversename.from_address(ip)
my_hostname = str(str(resolver.query(addr,"PTR")[0]))
except Exception as a1:
pass
if len(my_hostname) < 1:
my_hostname = str(ip)
ip_obj["ip_iso_code"] = iso
ip_obj["ip_city"] = ctn
ip_obj["ip_region"] = region
ip_obj["ip_country"] = cn
ip_obj["ip_postalcode"] = pc
ip_obj["ip_latitude"] = lat
ip_obj["ip_longitude"] = lon
ip_obj["ip_address"] = str(ip)
ip_obj["ip_hostname"] = str(my_hostname)
lat = str("") # Fallback, weil sonst in doSerpSearch() die LatLon Werte fehlen, wenn kein IP-to-Geo Auflösung funktioniert
lon = str("")
if len(ip) < 7:
return ip_obj
try:
rec = database.get_all(ip)
iso = str(rec.country_short)
cn = str(rec.country_long)
ctn = str(rec.city)
pc = str(rec.zipcode)
lat = str(rec.latitude)
lon = str(rec.longitude)
region = str(rec.region)
#return str(rec.country_short+","+rec.country_long+","+rec.region+","+rec.city+","+rec.zipcode+","+rec.latitude+","+rec.longitude+","+ip)
except Exception as as1:
pass
if len(iso) < 1 or len(cn) < 1 or len(region) < 1 or len(ctn) < 1 or len(pc) < 1 or len(lat) < 1 or len(lon) < 1:
try:
response = geoReader.city(ip)
iso2 = str(response.country.iso_code)
cn2 = str(response.country.name)
ctn2 = str(response.city.name)
pc2 = str(response.postal.code)
lat2 = str(response.location.latitude)
lon2 = str(response.location.longitude)
region2 = str(response.subdivisions.most_specific.name)
except Exception as a1:
pass
if len(iso) < 1:
iso = iso2
if len(cn) < 1:
cn = cn2
if len(region) < 1:
region = region2
if len(ctn) < 1:
ctn = ctn2
if len(pc) < 1:
pc = pc2
if len(lat) < 1:
lat = lat2
if len(lon) < 1:
lon = lon2
ip_obj["ip_iso_code"] = iso
ip_obj["ip_city"] = ctn
ip_obj["ip_region"] = region
ip_obj["ip_country"] = cn
ip_obj["ip_postalcode"] = pc
ip_obj["ip_latitude"] = lat
ip_obj["ip_longitude"] = lon
ip_obj["ip_address"] = str(ip)
ip_obj["ip_hostname"] = str(my_hostname)
return ip_obj
def beautifyUpperLowercase(self, text):
text1=self._beautifyUpperLowercase(text)
text2=self._beautifyUpperLowercaseUmlauts(text1)
return text2
def _beautifyUpperLowercase(self, strString):
#https://pypi.python.org/pypi/regex/
#http://code.activestate.com/recipes/576984-split-a-string-on-capitalized-uppercase-char-using/
if not isinstance(strString, str):
strString=strString.decode("utf8")
#result = re.sub(u'(?<=[A-Za-z])(?=[A-Z][a-z])', '. ', strString, re.UNICODE)
result = re.sub(u'(?<=[A-Za-z])(?=[A-Z][a-z])', ' ', strString, re.UNICODE)
return result
def _beautifyUpperLowercaseUmlauts(self, strString):
if not isinstance(strString, str):
strString=strString.decode("utf8")
#result = re.sub(u'(?<=[a-z])(?=[ÜÖÄ])', '. ', strString, re.UNICODE)
result = re.sub(u'(?<=[a-z])(?=[ÜÖÄ])', ' ', strString, re.UNICODE)
return result
def doLsaSummarizer(self, text):
# pip3 install -U sumy
# https://github.com/miso-belica/sumy/tree/main/sumy/data/stopwords
Language = self.detectTextLanguage(text)
if "en" in Language.lower() or "en" == Language.lower():
LANGUAGE = "english"
elif "fr" in Language.lower() or "fr" == Language.lower():
LANGUAGE = "french"
elif "es" in Language.lower() or "es" == Language.lower():
LANGUAGE = "spanish"
elif "it" in Language.lower() or "it" == Language.lower():
LANGUAGE = "italian"
elif "ru" in Language.lower() or "ru" == Language.lower():
LANGUAGE = "slovak"
elif "zh" in Language.lower() or "zh" == Language.lower() or "cn" in Language.lower() or "cn" == Language.lower():
LANGUAGE = "chinese"
elif "de" in Language.lower() or "de" == Language.lower():
LANGUAGE = "german"
elif "pt" in Language.lower() or "pt" == Language.lower():
LANGUAGE = "portuguese"
elif "jp" in Language.lower() or "jp" == Language.lower():
LANGUAGE = "japanese"
elif "hi" in Language.lower() or "hi" == Language.lower() or "in" in Language.lower() or "in" == Language.lower():
LANGUAGE = "hindi"
hindi = self.split_sentences(text)
if len(hindi) >= 3:
return str(" ".join(hindi[:3]))
else:
return str(" ".join(hindi))
elif "ar" in Language.lower() or "ar" == Language.lower() or "sa" in Language.lower() or "sa" == Language.lower():
LANGUAGE = "arabic"
hindi = self.split_sentences(text)
if len(hindi) >= 3:
return str(" ".join(hindi[:3]))
else:
return str(" ".join(hindi))
else:
LANGUAGE = "english"
SENTENCES_COUNT = 3
text = self.beautifyUpperLowercase(text)
parser = PlaintextParser.from_string(text,Tokenizer(LANGUAGE))
# or for plain text files
# parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)
summarizer.null_words = get_stop_words(LANGUAGE)
contentText = str("")
sent_list = list()
s_count = 0
for sentence in summarizer(parser.document, SENTENCES_COUNT):
if s_count <= SENTENCES_COUNT:
s_sent = str(sentence)
#contentText=contentText+s_sent+" "
sent_list.append(s_sent)
s_count+=1
return str(" ".join(sent_list))
def detectTextLanguage0(self, text): # https://github.com/whiletruelearn/langua
#langT =str("en")
text =str(text)
pLangDetect = Predict()
lang = str()
try:
lang = pLangDetect.get_lang(text)
except Exception as e:
pass#print("Language Detection failed: ",e)
if lang in ['de','en','es','it','fr','ru','zh-cn','ja','pt','hi','ar','tr']:
if lang == 'ja':
return 'jp'
if lang == 'zh-cn':
return 'cn'
if lang == 'hi':
return 'in'
if lang == 'ar':
return 'sa'
if lang == 'zh':
return 'cn'
return lang.lower()
return str("")
def detectTextLanguage1(self, text):
#langT =str("en")
text = str(text)
lang = str()
try:
lang = detect(text)
except Exception as e:
pass#print("Language Detection failed: ",e)
if lang in ['de','en','es','it','fr','ru','zh-cn','ja','pt','hi','ar','tr']:
if lang == 'zh-cn':
return 'cn'
if lang == 'ja':
return 'jp'
if lang == 'hi':
return 'in'
if lang == 'ar':
return 'sa'
if lang == 'zh':
return 'cn'
return lang.lower()
return str("")
def detectTextLanguage2(self, text): # https://github.com/saffsd/langid.py
lang = str()
#langT = str("en")
text = str(text)
langid.set_languages(['de','en','es','it','fr','ru','zh','ja','pt','hi','ar','tr'])
#try:
langTemp= langid.classify(text)
lang = langTemp[0].lower()
#if lang in ['de','en','es','it','fr']:
if lang in ['de','en','es','it','fr','ru','zh','ja','pt','hi','ar','tr']:
if lang == 'ja':
return 'jp'
if lang == 'hi':
return 'in'
if lang == 'ar':
return 'sa'
if lang == 'zh':
return 'cn'
return lang.lower()
#except Exception as e:
# pass#print("Language Detection failed: ",e)
return str("")
def detectTextLanguage3(self, text): # https://github.com/aboSamoor/pycld2/blob/master/cld2/internal/generated_language.h
#langT = str("en")
text = str(text)
lang = str()
try:
isReliable, textBytesFound, details = cld2.detect(
text, bestEffort=True
)
lang = str(details[0][1]).lower()
except Exception as e:
pass#print("Language Detection failed: ",e)
if lang in ['de','en','es','it','fr','ru','zh','ja','pt','hi','ar','tr']:
if lang == 'ja':
return 'jp'
if lang == 'hi':
return 'in'
if lang == 'ar':
return 'sa'
if lang == 'zh':
return 'cn'
return lang.lower()
return str("")
def detectTextLanguage(self, text):
# Language Detection via Fasttext: https://fasttext.cc/docs/en/language-identification.html
text = str(text)
languz0 = str()
languz1 = str()
languz2 = str()
languz3 = str()
resList = list()
try:
languz0 = self.detectTextLanguage0(text)
if len(languz0) == 2:
resList.append(str(languz0.lower()))
except Exception as as1:
pass
try:
languz1 = self.detectTextLanguage1(text)
if len(languz1) == 2:
resList.append(str(languz1.lower()))
except Exception as as1:
pass
try:
languz2 = self.detectTextLanguage2(text)
if len(languz2) == 2:
resList.append(str(languz2.lower()))
except Exception as as1:
pass
try:
languz3 = self.detectTextLanguage3(text)
if len(languz3) == 2:
resList.append(str(languz3.lower()))
except Exception as as1:
pass
return self.most_frequent(resList)
def most_frequent(self, List):
if not List:
return str("")
return max(set(List), key = List.count)
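# Usage example (illustrative):
#   most_frequent(["de", "en", "de"]) -> "de"
#   most_frequent([]) -> "" (guarded above; max() would raise on an empty sequence)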
def fixBrokenHTMLEntities(self, text):
"""
a.) &39; (=DIGIT ONLY!)
b.) re-insert the missing "#" so that html.unescape() recognises the entity
c.) html.unescape()
the correct result is: "'"
"""
###text = "hd&39;Hc&36;"
# match31 = '\&[a-z0-9A-Z]+\;'
text = str(text)
match37 = r'\&[0-9]+\;' # &39;
regex37 = re.compile(match37, re.IGNORECASE)
match_obj = regex37.findall(text)
for elm in match_obj:
elm_org = elm.replace("&", "&#") # &39; -> &#39;
elm_sub = unescape(elm_org) # &#39; -> '
text = text.replace(elm, elm_sub)
del match37
del regex37
del match_obj
text = str(text)
match37 = r'\#[0-9]+\;' # assumed pattern for entities missing their "&", e.g. #39; (the original line was garbled in this excerpt)
regex37 = re.compile(match37, re.IGNORECASE)
match_obj = regex37.findall(text)
for elm in match_obj:
elm_sub = unescape("&" + elm) # #39; -> &#39; -> '
text = text.replace(elm, elm_sub)
return text
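# Usage example (illustrative):
#   fixBrokenHTMLEntities("it&39;s") -> "it's"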
def isDomainBlacklisted(self, url):
#return self.isBlacklisted(url, self.stoplist_domains_blocker)
domain = str("")
try:
domain = urlparse(url).netloc
except Exception as as1:
pass
if domain in self.stopdict_domains_blocker:
return True
return False
def isPaywall(self, text):
# return True: blacklist entry found
# return False: no blacklist entry found
#stoplist = "/home/unaique/library/blacklists/paywall.txt"
return self.isBlacklisted(text, self.stoplist_paywall_blocker)
def isPornBlacklisted(self, text):
return self.isBlacklisted(text, self.stoplist_porn_blocker)
def split_sentences(self, text):
text = " " + text + " "
text = text.replace("\n"," ")
text = re.sub(prefixes,"\\1<prd>",text)
text = re.sub(websites,"<prd>\\1",text)
if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
if "”" in text: text = text.replace(".”","”.")
if "\"" in text: text = text.replace(".\"","\".")
if "!" in text: text = text.replace("!\"","\"!")
if "?" in text: text = text.replace("?\"","\"?")
text = text.replace(".",".<stop>")
text = text.replace("?","?<stop>")
text = text.replace("!","!<stop>")
text = text.replace("<prd>",".")
sentences = text.split("<stop>")
sentences = sentences[:-1]
sentences = [s.strip() for s in sentences]
return sentences
def getKnowledgeGraphMoreInfo(self, search, language):
link = "https://kgsearch.googleapis.com/v1/entities:search?key=#######&limit=1&languages="+str(language)+"&indent=False&query="+str(search)
response1 = self.getWebpage(link)
sentences = str("")
if self.is_json(response1):
try:
response = json.loads(response1)
for element in response['itemListElement']:
sentences = element['result']['detailedDescription']['articleBody']
except Exception as a1:
pass
sents = self.split_sentences(sentences)
sentences_c1 = sentences.count('.')
sentences_c2 = sentences.count('!')
sentences_c3 = sentences.count('?')
if len(sents) > 0:
return str(sents[0]) #sentences.split('.')[0]
elif sentences_c1 == 1 or sentences_c2 == 1 or sentences_c3 == 1:
return str(sentences)
elif len(sentences) > 10:
return str(sentences)
return str("")
def is_json(self, myjson):
try:
json.loads(myjson)
except ValueError as e:
return False
return True
def BingAPI(self, MainKeyword, SubKeywords, Language, enableFastmode):
enableFastmode = self.useFastmode(MainKeyword+" "+SubKeywords)
res_set = set()
addSearchMK = str()
addSearchSK = str()
maxCounter = int(10) # use at most 10 results
if enableFastmode:
maxCounter = int(5)
mk_words = self.count_words_regex(MainKeyword)
sk_words = self.count_words_regex(SubKeywords)
"""
if (MainKeyword.count(" ") == 0 or mk_words == 0) and not enableFastmode:
addSearchMK = self.getKnowledgeGraphMoreInfo(MainKeyword, Language)
if (SubKeywords.count(" ") == 0 or sk_words == 0) and not enableFastmode:
addSearchSK = self.getKnowledgeGraphMoreInfo(SubKeywords, Language)
"""
if mk_words >= 10:
#addSearchSK_v2 = getKnowledgeGraphMoreInfo(SubKeywords, Language)
#query_string = str(SubKeywords+", "+addSearchSK_v2)
query_string = str(SubKeywords)
elif sk_words >= 10:
#addSearchMK_v2 = getKnowledgeGraphMoreInfo(MainKeyword, Language)
#query_string = str(MainKeyword+", "+addSearchMK_v2)
query_string = str(MainKeyword)
else:
#query_string = str(MainKeyword+", "+addSearchMK+", "+SubKeywords+", "+addSearchSK)
query_string = str(MainKeyword+", "+SubKeywords)
if "de" in Language.lower() or Language.lower() == "de":
lang = "de"
elif "en" in Language.lower() or Language.lower() == "en":
lang = "en"
elif "fr" in Language.lower() or Language.lower() == "fr":
lang = "fr"
elif "es" in Language.lower() or Language.lower() == "es":
lang = "es"
elif "it" in Language.lower() or Language.lower() == "it":
lang = "it"
elif "ru" in Language.lower() or Language.lower() == "ru":
lang = "ru"
elif "zh" in Language.lower() or Language.lower() == "zh" or "cn" in Language.lower() or Language.lower() == "cn":
lang = "CHT"
elif "pt" in Language.lower() or Language.lower() == "pt":
lang = "pt"
elif "jp" in Language.lower() or Language.lower() == "jp":
lang = "ja"
elif "hi" in Language.lower() or Language.lower() == "hi" or "in" in Language.lower() or Language.lower() == "in":
lang = "en"
elif "ar" in Language.lower() or Language.lower() == "ar" or "sa" in Language.lower() or Language.lower() == "sa":
lang = "en"
elif "tr" in Language.lower() or Language.lower() == "tr":
lang = "tr"
else:
lang = "en"
#try:
# https://github.com/searx/searx/tree/master/searx/engines
#link = "https://api.qwant.com/v3/search/web?q=Hundenahrung&count=10&offset=0&locale=de_de"
base_url = 'https://www.bing.com/'
search_string = 'search?{query}&first=0'
query = 'language:{} {}'.format(lang.upper(), query_string)
search_path = search_string.format(query=urlencode({'q': query}), offset=0)
website = base_url + search_path
#print(website)
html_page = self.getWebpage(website)
#print(html_page)
soup = BeautifulSoup(html_page, "lxml")
mcounter = int(0)
for link in soup.findAll('a'):
myLink = str(link.get('href'))
#myLink = str(link.text)
if self.is_url(myLink) and myLink.find("bing.com") == -1 and myLink.find("javascript:") == -1 and len(myLink) > 12 and not myLink.startswith('/') and (myLink.lower().startswith("http") or myLink.lower().startswith("https") ) and mcounter < maxCounter:
#print("Link:",myLink)
res_set.add(myLink)
mcounter += int(1)
#except Exception as a:
# #print("htmlify",a)
# pass
return res_set
def QuantAPI(self, MainKeyword, SubKeywords, Language, enableFastmode):
enableFastmode = self.useFastmode(MainKeyword+" "+SubKeywords)
res_set = set()
addSearchMK = str()
addSearchSK = str()
maxCounter = int(10) # Qwant returns at most 10 results
if enableFastmode:
maxCounter = int(5)
mk_words = self.count_words_regex(MainKeyword)
sk_words = self.count_words_regex(SubKeywords)
"""
if (MainKeyword.count(" ") == 0 or mk_words == 0) and not enableFastmode:
addSearchMK = self.getKnowledgeGraphMoreInfo(MainKeyword, Language)
if (SubKeywords.count(" ") == 0 or sk_words == 0) and not enableFastmode:
addSearchSK = self.getKnowledgeGraphMoreInfo(SubKeywords, Language)
"""
if mk_words >= 10:
#addSearchSK_v2 = getKnowledgeGraphMoreInfo(SubKeywords, Language)
#query_string = str(SubKeywords+", "+addSearchSK_v2)
query_string = str(SubKeywords)
elif sk_words >= 10:
#addSearchMK_v2 = getKnowledgeGraphMoreInfo(MainKeyword, Language)
#query_string = str(MainKeyword+", "+addSearchMK_v2)
query_string = str(MainKeyword)
else:
#query_string = str(MainKeyword+", "+addSearchMK+", "+SubKeywords+", "+addSearchSK)
query_string = str(MainKeyword+", "+SubKeywords)
if "de" in Language.lower() or Language.lower() == "de":
lang = "de_de"
elif "en" in Language.lower() or Language.lower() == "en":
lang = "en_us"
elif "fr" in Language.lower() or Language.lower() == "fr":
lang = "fr_fr"
elif "es" in Language.lower() or Language.lower() == "es":
lang = "es_es"
elif "it" in Language.lower() or Language.lower() == "it":
lang = "it_it"
elif "ru" in Language.lower() or Language.lower() == "ru":
lang = "us_en"
elif "zh" in Language.lower() or Language.lower() == "zh" or "cn" in Language.lower() or Language.lower() == "cn":
lang = "zh_cn"
elif "pt" in Language.lower() or Language.lower() == "pt":
lang = "pt_pt"
elif "jp" in Language.lower() or Language.lower() == "jp":
lang = "us_en"
elif "hi" in Language.lower() or Language.lower() == "hi" or "in" in Language.lower() or Language.lower() == "in":
lang = "us_en"
elif "ar" in Language.lower() or Language.lower() == "ar" or "sa" in Language.lower() or Language.lower() == "sa":
lang = "us_en"
elif "tr" in Language.lower() or Language.lower() == "tr":
lang = "us_en"
else:
lang = "us_en"
#try:
# https://github.com/searx/searx/tree/master/searx/engines
#link = "https://api.qwant.com/v3/search/web?q=Hundenahrung&count=10&offset=0&locale=de_de"
website = "https://api.qwant.com/v3/search/web?q="+query_string+"&count=10&offset=0&locale="+lang
#print(website)
html_page = self.getWebpage(website)
if self.is_json(html_page):
#print("isjson")
search_results = json.loads(html_page)
data = search_results.get('data', {})
mcounter = int(0)
if search_results.get('status') == 'success':
mainline = data.get('result', {}).get('items', {}).get('mainline', {})
if not mainline:
return res_set
for row in mainline:
mainline_type = row.get('type', 'web')
if mainline_type != 'web':
continue
mainline_items = row.get('items', [])
for item in mainline_items:
#title = item.get('title', None)
myLink = item.get('url', "")
#if not textify.isDomainBlacklisted(myLink) and len(myLink) > 12 and not myLink.lower().endswith(".pdf") and (myLink.lower().startswith("http") or myLink.lower().startswith("https") ):
#if len(myLink) > 12 and (myLink.lower().startswith("http") or myLink.lower().startswith("https") ) and mcounter < maxCounter:
#print("/api.qwant.com Adding Website:",str(myLink))
res_set.add(myLink)
mcounter += int(1)
#except Exception as a:
# #print("htmlify",a)
# pass
return res_set
def DuckduckgoAPI(self, MainKeyword, SubKeywords, Language, enableFastmode):
#print("DuckduckgoAPI")
res_set = set()
addSearchMK = str()
addSearchSK = str()
maxCounter = int(50)
if enableFastmode:
maxCounter = int(5)
mk_words = self.count_words_regex(MainKeyword)
sk_words = self.count_words_regex(SubKeywords)
if mk_words >= 10:
#addSearchSK_v2 = getKnowledgeGraphMoreInfo(SubKeywords, Language)
#query_string = str(SubKeywords+", "+addSearchSK_v2)
query_string = str(SubKeywords)
elif sk_words >= 10:
#addSearchMK_v2 = getKnowledgeGraphMoreInfo(MainKeyword, Language)
#query_string = str(MainKeyword+", "+addSearchMK_v2)
query_string = str(MainKeyword)
else:
#query_string = str(MainKeyword+", "+addSearchMK+", "+SubKeywords+", "+addSearchSK)
query_string = str(MainKeyword+", "+SubKeywords)
# https://api.duckduckgo.com/api
# https://duckduckgo.com/params
# https://help.duckduckgo.com/duckduckgo-help-pages/results/syntax/
if "de" in Language.lower() or Language.lower() == "de":
lang = "de-de"
elif "en" in Language.lower() or Language.lower() == "en":
lang = "us-en"
elif "fr" in Language.lower() or Language.lower() == "fr":
lang = "fr-fr"
elif "es" in Language.lower() or Language.lower() == "es":
lang = "es-es"
elif "it" in Language.lower() or Language.lower() == "it":
lang = "it-it"
elif "ru" in Language.lower() or Language.lower() == "ru":
lang = "ru-ru"
elif "zh" in Language.lower() or Language.lower() == "zh" or "cn" in Language.lower() or Language.lower() == "cn":
lang = "cn-zh"
elif "pt" in Language.lower() or Language.lower() == "pt":
lang = "pt-pt"
elif "jp" in Language.lower() or Language.lower() == "jp":
lang = "jp-jp"
elif "hi" in Language.lower() or Language.lower() == "hi" or "in" in Language.lower() or Language.lower() == "in":
lang = "in-en"
elif "ar" in Language.lower() or Language.lower() == "ar" or "sa" in Language.lower() or Language.lower() == "sa":
lang = "xa-ar"
elif "tr" in Language.lower() or Language.lower() == "tr":
lang = "tr-tr"
else:
lang = "us-en"
#print("DuckduckgoAPI fetch")
#try:
#website = "https://html.duckduckgo.com/html/?q="+query_string+"&lang="+lang
# https://help.duckduckgo.com/features/safe-search/ -> &kp=1 (strict), &kp=-1 (moderate - don't show explicit results), or &kp=-2 (off)
#website = "https://html.duckduckgo.com/html/?q="+query_string+"&lang="+lang+"&kl="+lang+"&kp=1&kaf=1&t=artikelschreiber.com" #"&lang=de-de&kl=de-de"
website = "https://duckduckgo.com/html?q="+query_string+"&kl="+lang+"&ia=web&kp=1&kaf=1&t=artikelschreiber.com"
html_page = self.getWebpage(website)
#print("DuckduckgoAPI html",website)
soup = BeautifulSoup(html_page, "lxml")
mcounter = int(0)
for link in soup.findAll('a', attrs={'class' : 'result__url'}):
#myLink = str(link.get('href'))
myLink = str(link.text)
#print("DuckduckgoAPI Link:",myLink)
if len(myLink) > 12 and not myLink.startswith('/') and (myLink.lower().startswith("http") or myLink.lower().startswith("https") ) and mcounter < maxCounter:
res_set.add(myLink)
mcounter += int(1)
#except Exception as a:
# #print("htmlify",a)
# pass
return res_set
def getWebpage(self, link):
#ftfy.fix_text(text, *, fix_entities='auto', remove_terminal_escapes=True, fix_encoding=True, fix_latin_ligatures=True, fix_character_width=True, uncurl_quotes=True, fix_line_breaks=True, fix_surrogates=True, remove_control_chars=True, remove_bom=True, normalization='NFC', max_decode_length=1000000)
if isinstance(link, str) and (link.lower().startswith("http") or link.lower().startswith("https")):
# use python request library for fetching
#print("getWebpage(self, link)",link)
try:
with requests.get(link, headers=HeadersSimple, timeout=60, verify=False, allow_redirects=True) as r1: #keep alive
#r1 = requests.get(link, headers=HeadersSimple, timeout=6, verify=False)
#r1.encoding = r1.apparent_encoding
#r1.encoding = 'utf-8'
#r1.encoding = 'latin-1'
myStatus = r1.status_code
myText = str(r1.text)
myContent = str(r1.headers.get('content-type', '')) # .get() guards against a missing header
myText = myText.replace('\n', ' ')
mT1 = myText.strip()
#print("getWebpage(self, link),Status Code:",myStatus," -> ",link)
if myStatus == 200:
#print("htmlify.getWebpage() Webpage size HTTP:", str(myStatus)," -> ", len(myText)," -> ", myContent)
return str(mT1)
else:
return str("")
except Exception as er:
#print("Unexpected error: getWebpage(link)", sys.exc_info()[0])
#exc_type, exc_obj, exc_tb = sys.exc_info()
#fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
#print(exc_type, fname, exc_tb.tb_lineno)
return str("")
pass
#print("htmlify.getWebpage(link): Empty HTML Document!")
return str("")
def isLowQualityContent(self, text):
# the higher the counter, the worse the score
mycounter = int(0)
for stopword in low_quality_content:
if self.isFound(text, stopword):
#print("Stopword found:",stopword)
mycounter += int(1)
#True, False
return mycounter
def isLowQualityContentText(self, text):
# True as soon as any stop word from low_quality_content is found
mycounter = False
for stopword in low_quality_content:
if self.isFound(text, stopword):
#print("Stopword found:",stopword)
mycounter = True # bad result, i.e. low-quality content
#True, False
return mycounter
def remove_control_characters1(self, s):
return re1.sub(r'\p{C}', '', s)
def isFound(self, text, stopword):
# return True: stop word found in text
# return False: stop word not found in text
#print("Text:'"+text+"' and Stopword:'"+stopword+"'")
t_text_org = str(text)
text = str(text.lower().strip("\n"))
text_len = len(text)
stopword = str(stopword).lower()
space_stopword = str(" ")+str(stopword)+str(" ")
#https://www.pythontutorial.net/python-regex/python-regex-word-boundary/
matches1 = re.finditer(r'\b' + re.escape(stopword) + r'\b', text) # both \b must be raw strings; '\b' in a plain string is a backspace character
for match in matches1:
if match:
return True
if text.count(space_stopword) >= 1:
#if space_stopword.find(text) != -1:
#print("isBlacklisted() text.find() Match:",text,ele)
return True
if stopword == t_text_org.lower():
return True
if stopword in text:
return True
if ("_" in text or "-" in text) and text_len >= 2 and text_len < 6:
#print("Text:'"+text+"' and Stopword:'"+stopword+"'")
if text.find(stopword) != -1:
return True
return False
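# Usage examples (illustrative):
#   isFound("the best dog food", "best") -> True (word-boundary match)
#   isFound("contest", "test") -> True (plain substring containment also counts)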
def get_dictionary_by_key_value(self, dictionary, target_key, target_value):
"""Return a dictionary that contains a target key value pair.
Args:
dictionary: Metadata dictionary containing lists of other dictionaries.
target_key: Target key to search for within a dictionary inside a list.
target_value: Target value to search for within a dictionary inside a list.
Returns:
target_dictionary: Target dictionary that contains target key value pair.
"""
for key in dictionary:
if len(dictionary[key]) > 0:
for item in dictionary[key]:
if item[target_key] == target_value:
return item
def get_title(self, html):
"""Scrape the page title, falling back from <title> to og:title, twitter:title and <h1>."""
try:
for title in html.find_all('title'):
return str(title.get_text()).split('|')[0].strip() # drop a "| Site Name" suffix
except Exception as as1:
pass
try:
if html.find("meta", property="og:title"):
description = html.find("meta", property="og:title").get('content')
return self.extractTextFromHTML(description)
except Exception as as1:
pass
try:
if html.find("meta", property="twitter:title"):
description = html.find("meta", property="twitter:title").get('content')
return self.extractTextFromHTML(description)
except Exception as as1:
pass
try:
if html.find("h1"):
return html.find("h1").string
except Exception as as1:
pass
try:
if html.find_all("h1"):
return html.find_all("h1")[0].string
except Exception as as1:
pass
return str("")
def get_description(self, html):
#description = str("")
try:
if html.find("meta", property="og:description"):
description = html.find("meta", property="og:description").get('content')
return description
except Exception as as1:
pass
try:
if html.find("meta", attrs={'name': 'description'}): # the plain description tag uses the name attribute, not property
description = html.find("meta", attrs={'name': 'description'}).get('content')
return description
except Exception as as1:
pass
try:
if html.find("meta", property="twitter:description"):
description = html.find("meta", property="twitter:description").get('content')
return description
except Exception as as1:
pass
try:
if html.find("p"):
description = str(html.find("p").get_text()) # <p> has no content attribute; use its text
return re.sub(r"\s{2,}", " ", description).strip()
except Exception as as1:
pass
return str("")
def get_image(self, html, web_url):
"""Scrape share image."""
#image = str("")
try:
if html.find("meta", property="image"):
image = html.find("meta", property="image").get('content')
li = urljoin(web_url, image)
return li
except Exception as as1:
pass
try:
if html.find("meta", property="og:image"):
image = html.find("meta", property="og:image").get('content')
li = urljoin(web_url, image)
return li
except Exception as as1:
pass
try:
if html.find("meta", property="twitter:image"):
image = html.find("meta", property="twitter:image").get('content')
li = urljoin(web_url, image)
return li
except Exception as as1:
pass
try:
images = html.find_all("img", src=True)
if images:
image = images[0].get('src')
li = urljoin(web_url, image)
return li
except Exception as as1:
pass
return str("")
def get_video(self, html):
#og_video = str("")
try:
og_video = html.find("meta", property="og:video").get('content')
return og_video
#og_language2 = html.find("meta", attrs={'property':'og:locale'}).get('content')
except Exception as as1:
pass
return str("")
def get_meta_keywords(self, soup, text, url):
listKeywordBlogs = ["BlogPosting", "NewsArticle", "Article", "CreativeWork", "Organization","Product"]
# https://practicaldatascience.co.uk/data-science/how-to-scrape-schemaorg-metadata-using-python
metadata = dict() # initialise so the loop below cannot hit an undefined name when extract() fails
try:
base_url = get_base_url(text, url)
metadata = extruct.extract(text,
base_url=base_url,
uniform=True,
syntaxes=['json-ld',
'microdata',
'opengraph'])
except Exception as as1:
pass
for element in listKeywordBlogs:
try:
kw = self.get_dictionary_by_key_value(metadata, "@type", element)
return str(kw['keywords'])
except Exception as as1:
pass
try:
kwords = str(soup.find("meta", attrs={'name':'keywords'}).get('content'))
if len(kwords) > 21:
return kwords
except Exception as as1:
pass
return str("")
def get_og_locale(self, html):
# collect candidate locale strings in priority order, then map the first
# usable one onto the internal two-letter code; later matches override
# earlier ones, exactly as in the original if-cascade
candidates = list()
try:
candidates.append(html.find("meta", property="og:locale").get('content'))
except Exception as as1:
pass
try:
candidates.append(html.find("meta", attrs={'property':'og:locale'}).get('content'))
except Exception as as1:
pass
try:
candidates.append(html.find("html").attrs.get('lang'))
except Exception as as1:
pass
lang_map = [("en","en"), ("fr","fr"), ("es","es"), ("it","it"), ("ar","ar"), ("sa","ar"), ("tr","tr"), ("ru","ru"), ("zh","zh"), ("cn","zh"), ("de","de"), ("pt","pt"), ("jp","ja"), ("ja","ja"), ("in","in"), ("hi","in")]
for og_language in candidates:
if not og_language:
continue
LANGUAGE = str("")
for needle, code in lang_map:
if self.isFound(og_language, needle):
LANGUAGE = code
if len(LANGUAGE) == 2:
return str(LANGUAGE)
return str("")
def get_links(self, html, web_url):
links = set()
for link in html.findAll('a'):
link_href = link.get('href')
if link_href and not self.isFound(link_href,"javascript"): # guard: href may be missing
li = urljoin(web_url, link_href) # https://stackoverflow.com/questions/44001007/scrape-the-absolute-url-instead-of-a-relative-path-in-python
links.add(li)
return list(links)
def count_words_regex(self, text):
# Counting words with regular expressions -> https://datagy.io/python-count-words/#:~:text=%23-,Counting%20words,-with%20regular%20expressions
return len(re.findall(r'\w+', text))
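# Usage example (illustrative):
#   count_words_regex("Amazon SEO, best guide") -> 4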
# https://practicaldatascience.co.uk/data-science/how-to-identify-internal-and-external-links-using-python#:~:text=Determine%20whether%20the%20link%20is%20internal%20or%20external
def is_internal(self, url, domain):
if not bool(urlparse(url).netloc):
return True
elif url.startswith(domain):
return True
else:
return False #is_external
def isBlacklisted(self, text, stoplist):
# return True: a blacklist entry was found in the text
# return False: no blacklist entry was found
text = str(text.lower().strip("\n"))
for x in range(len(stoplist)):
ele2 = str(stoplist[x].lower().strip("\n"))
if not ele2.startswith("#") and len(ele2) >= 1:
# print("Ele:",ele2)
if self.isFound(text, ele2):
# print("Stopword isFound:'",ele2,"'")
return True
if ele2.find(text) != -1 or text.find(ele2) != -1:
# print("Stopword is -find-:'",ele2,"'")
return True
return False
def currency_symbols(self, text: str, repl: str = "_CUR_") -> str:
"""Replace all currency symbols in ``text`` with ``repl``."""
return RE_CURRENCY_SYMBOL.sub(repl, text)
def emails(self, text: str, repl: str = "_EMAIL_") -> str:
"""Replace all email addresses in ``text`` with ``repl``."""
return RE_EMAIL.sub(repl, text)
def emojis(self, text: str, repl: str = "_EMOJI_") -> str:
"""
Replace all emoji and pictographs in ``text`` with ``repl``.
Note:
If your Python has a narrow unicode build ("UCS-2"), only dingbats
and miscellaneous symbols are replaced because Python isn't able
to represent the unicode data for things like emoticons. Sorry!
"""
return RE_EMOJI.sub(repl, text)
def hashtags(self, text: str, repl: str = "_TAG_") -> str:
"""Replace all hashtags in ``text`` with ``repl``."""
return RE_HASHTAG.sub(repl, text)
def numbers(self, text: str, repl: str = "_NUMBER_") -> str:
"""Replace all numbers in ``text`` with ``repl``."""
return RE_NUMBER.sub(repl, text)
def phone_numbers(self, text: str, repl: str = "_PHONE_") -> str:
"""Replace all phone numbers in ``text`` with ``repl``."""
return RE_PHONE_NUMBER.sub(repl, text)
def urls(self, text: str, repl: str = "_URL_") -> str:
"""Replace all URLs in ``text`` with ``repl``."""
return RE_SHORT_URL.sub(repl, RE_URL.sub(repl, text))
def user_handles(self, text: str, repl: str = "_USER_") -> str:
"""Replace all (Twitter-style) user handles in ``text`` with ``repl``."""
return RE_USER_HANDLE.sub(repl, text)
def preprocess_text(self, x):
x = str(x)
#x = currency_symbols(x, " ")
x = self.emails(x, " ")
x = self.phone_numbers(x, " ")
x = self.emojis(x, " ")
x = self.hashtags(x, " ")
x = self.user_handles(x, " ")
x = re.sub(r"\s{2,}", " ", x) # over spaces
"""
if isCode(x):
x = currency_symbols(x, "")
#x = urls(x, "")
#x = x.encode("ascii", "ignore").decode() # unicode
x = re.sub("[^.,!?A-Za-z0-9]+", " ", x) # special charachters except .,!?
x = ""
"""
return str(x)
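# Rough sketch of the effect (exact output depends on the RE_* patterns defined above):
# self.preprocess_text("Mail me at test@example.com #seo @someuser   please")
# -> approximately "Mail me at please": emails, hashtags and user handles are replaced by spaces,
#    then runs of whitespace are collapsed to a single space.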
def isCode(self, text):
myCode = ["=","{","}",":","|","$","_","(",")",">","<"]
myCount = 0
for c in myCode:
myCount += text.count(c)
if myCount > 3:
return True # BAD: We have JavaScript Code
return False # Good: We have NO JavaScript Code
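# Hedged example: more than three occurrences of the listed code characters flag the text as code.
# self.isCode("function(a){return a>b;}")  # -> True  (5 hits: ( ) { } >)
# self.isCode("Ein ganz normaler Satz.")   # -> False (0 hits)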
def count_spaces(self, s):
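# note: this counts the leading and trailing whitespace characters of s (len(s) - len(s.strip())), not every space in s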
return len(s) - len(s.strip())
def extractTextFromHTML(self, html):
try:
article.set_html(html)
article.parse()
return str(article.text)
except Exception as as1:
pass
try:
#soup = BeautifulSoup(html, "html5lib") #"lxml") # pip install -U html5lib # https://zetcode.com/python/beautifulsoup/
#soup = BeautifulSoup(html, "html.parser")
soup = BeautifulSoup(html, "lxml")
for tag in soup():
# CSS to remove attributes -> https://itecnote.com/tecnote/python-remove-all-inline-styles-using-beautifulsoup/
for attribute in ["class", "id", "name", "style"]:
del tag[attribute]
for data in soup(['style', 'script', 'comment']):
# Remove tags -> https://www.geeksforgeeks.org/remove-all-style-scripts-and-html-tags-using-beautifulsoup/
data.decompose()
myText = str(soup.find('body').text) # https://www.geeksforgeeks.org/find-the-text-of-the-given-tag-using-beautifulsoup/
return re.sub(r"\s{2,}", " ", myText)
except Exception as as1:
pass
try:
f = re.findall(r'>(.*?)<', html, re.MULTILINE | re.DOTALL)
rList = list()
lastStr = str("")
for ele in f:
ele = ele.strip()
t = self.remove_control_characters1(ele)
rList.append(t)
return str(self.remove_text_inside_brackets(" ".join(rList)))
except Exception as as1:
pass
return str("")
def headline_extractor(self, headline, soup):
# Note: the article source is almost always a URI rather than a URL, so we assume that all H3 sub-headings belong to exactly one H1 heading
tmp_cont_list = list()
tmp_str = str("")
resultLists = list()
myTelerHeadlines = {}
sit = soup.findAll(headline)
headline_text = str("")
for h1 in sit:
h = h1.text.strip() # heading text
tags = h1.find_next_siblings() # all content below this heading
headl_txt = self.preprocess_text(h)
headl_txt_len = len(headl_txt)
if headl_txt and headl_txt_len >= self.headline_min_length:
headline_text = headl_txt
for tag in tags:
tag_content = str(tag.get_text()) # -> not working: extractTextFromHTML(str(tag).string)
tag_content_len = len(tag_content)
if tag_content and tag_content_len >= self.headline_min_length:
count_space_int = self.count_spaces(tag_content)
if count_space_int <= self.max_spaces_count and self.count_words_regex(tag_content) >= self.min_word_count and tag_content_len >= self.min_text_length:
#print(h+" -> # \t\t Spaces:",count_space_int," -> "+tag_content+" -> Len:",tag_content_len)
#headline_text = preprocess_text(h)
tmp_str12 = self.preprocess_text(tag_content)
tmp_cont_list.append(tmp_str12)
tmp_str12 = str("")
tmp_str = str(" ".join(tmp_cont_list))
if headline.lower() == "h1":
myTelerHeadlines[headline] = headline_text
myTelerHeadlines[headline+"_text"] = headline_text
myTelerHeadlines[headline+"_text_low_quality_score"] = self.isLowQualityContent(headline_text)
myTelerHeadlines[headline+"_text_ai"] = str("") # h1-Überschrift wird nie mittels KI umgewandelt
myTelerHeadlines[headline+"_text_ai_plagscore"] = int(0) # h1-Überschrift wird nie mittels KI umgewandelt
myTelerHeadlines[headline+"_text_till_end"] = headline_text
myTelerHeadlines[headline+"_text_till_end_low_quality_score"] = self.isLowQualityContent(headline_text)
resultLists.append(myTelerHeadlines)
myTelerHeadlines = {} # re-initialize for the next heading element ("del" would raise a NameError on the next loop pass)
tmp_str = str("")
tmp_cont_list = list()
else:
myTelerHeadlines[headline] = headline_text
myTelerHeadlines[headline+"_text"] = tmp_str
myTelerHeadlines[headline+"_text_low_quality_score"] = self.isLowQualityContent(tmp_str)
myTelerHeadlines[headline+"_text_ai"] = str("")
myTelerHeadlines[headline+"_text_ai_plagscore"] = int(0)
myTelerHeadlines[headline+"_text_till_end"] = str("")
myTelerHeadlines[headline+"_text_till_end_low_quality_score"] = int(0)
resultLists.append(myTelerHeadlines)
myTelerHeadlines = {} # re-initialize for the next heading element ("del" would raise a NameError on the next loop pass)
tmp_str = str("")
tmp_cont_list = list()
return resultLists
def headline_to_html_end_extractor(self, headline, matching_text, soup):
#soup = BeautifulSoup(html, "html5lib") # wir brauchen hier das html5lib
# Code für H2-Headline bis zum finalen ""-Tag
tmp_set = set()
tmp_list = list()
hit_found = False
#[tag.name for tag in soup.find_all()] https://www.geeksforgeeks.org/get-all-html-tags-with-beautifulsoup/
for tag in soup.find_all():
tag_text = str(tag.text)
tag_content = tag.prettify() # str(tag) oder tag.prettify() enthält das HTML -> https://stackoverflow.com/questions/25729589/how-to-get-html-from-a-beautiful-soup-object
tag_name = str(tag.name)
tag_text_preprocessed = self.preprocess_text(tag_text)
#if self.isFound(tag_name, headline):#(headline in tag_name or headline == tag_name):
# 1#print("'"+tag_name+"'---#headline#--->>>"+tag_text)
#if self.isFound(tag_text, matching_text):# and self.isFound(tag_name, headline):
# 1#print("'"+tag_name+"'---#isFound#--->>>"+tag_text)
if self.isFound(tag_text, matching_text) and self.isFound(tag_name, headline):
#print("'"+tag_name+"'---#isFound#--->>>"+tag_text)
hit_found = True
if hit_found and tag_text_preprocessed not in tmp_set and re.findall(r'\w+[.!?]$', tag_text_preprocessed, re.MULTILINE | re.DOTALL | re.IGNORECASE | re.UNICODE):
tmp_list.append(tag_content)
tmp_set.add(tag_text_preprocessed) # the text must not appear twice; every added chunk must end with sentence punctuation [.!?]
#print("Word with Sentence Ending:",tag_text)
#print(tag.attrs)
#print(hit_found)
#print(matching_text)
#sys.exit(1)
pre_html = str(" ".join(tmp_list))
return self.extractTextFromHTML(pre_html)
def generateBeautifulTitle(self, text):
special_char_map = {ord('ä'):'ae', ord('ü'):'ue', ord('ö'):'oe', ord('ß'):'ss'} # https://stackoverflow.com/questions/2054746/how-to-search-and-replace-utf-8-special-characters-in-python
result = re.sub(r'(\w{2,})\.(\w{2,})', 'ArtikelSchreiber.com', text)
title = result.translate(special_char_map)
title_split = list()
if title.find("|") != -1:
title_split = title.split("|")
elif title.find(" - ") != -1:
title_split = title.split(" - ") # we want this " Walking - The good choice?" and not those "4-Tage-Bart"
#print("'-' split:",title_split)
elif title.find(":") != -1:
title_split = title.split(":")
#print("':' split:",title_split)
elif title.find("?") != -1:
title_split = title.split("?")
#print("'?' split:",title_split)
elif title.find("&") != -1:
title_split = title.split("&")
#print("'&' split:",title_split)
elif title.find(",") != -1:
title_split = title.split(",")
#print("',' split:",title_split)
elif title.find(";") != -1:
title_split = title.split(";")
#print("';' split:",title_split)
if len(title_split) >= 2:
return str(title_split[0]).strip()
else:
return str(title)
return str(text)
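# Hedged example (hypothetical page title): umlauts are transliterated and only the part before the first separator is kept.
# self.generateBeautifulTitle("Grüne Smoothies | Rezepte & Tipps")  # -> "Gruene Smoothies"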
def generateCanonicalURL(self, myTmpTitle, alternative_url, language):
seo_lang_add = dict()
seo_lang_add["de"] = str("artikel")
seo_lang_add["en"] = str("article")
seo_lang_add["it"] = str("articolo")
seo_lang_add["es"] = str("articulo")
seo_lang_add["fr"] = str("article")
special_char_map = {ord('ä'):'ae', ord('ü'):'ue', ord('ö'):'oe', ord('ß'):'ss'} # https://stackoverflow.com/questions/2054746/how-to-search-and-replace-utf-8-special-characters-in-python
regex = re.compile('[^a-zA-Z0-9 öäüßÖÄÜ]')
ws_filter = re.compile(r"\s+") # multiple space " " to one space " "
max_canonical_words = int(5) # maximum Website Title words in the final canonical url
title_tmp = myTmpTitle
title = ws_filter.sub(" ",title_tmp).strip()
title_content = str("")
title_split = list()
if title.find(":") != -1:
title_split = title.split(":")
#print("':' split:",title_split)
if title.find(" - ") != -1:
title_split = title.split(" - ") # we want this " Walking - The good choice?" and not those "4-Tage-Bart"
#print("'-' split:",title_split)
if title.find("|") != -1:
title_split = title.split("|")
#print("'|' split:",title_split)
if title.find("?") != -1:
title_split = title.split("?")
#print("'?' split:",title_split)
if title.find("&") != -1:
title_split = title.split("&")
#print("'&' split:",title_split)
if title.find(",") != -1:
title_split = title.split(",")
#print("',' split:",title_split)
if title.find(";") != -1:
title_split = title.split(";")
#print("';' split:",title_split)
if len(title_split) >= 2:
title_content_1 = str(title_split[0]).strip()
title_content_2 = str(title_split[1]).strip()
#print("DEBUG title_content_1:",title_content_1)
#print("DEBUG title_content_2:",title_content_2)
title_content_1= ''.join(c for c in title_content_1 if not c.isalpha() or not c.isspace() or not c.isdigit()).strip() # The isalpha() method returns True if all the characters are alphabet letters (a-z).
title_content_1= regex.sub('', title_content_1).strip() # remove non letter and digits from string: https://stackoverflow.com/questions/22520932/python-remove-all-non-alphabet-chars-from-string
title_content_2= ''.join(c for c in title_content_2 if not c.isalpha() or not c.isspace() or not c.isdigit()).strip() # The isalpha() method returns True if all the characters are alphabet letters (a-z).
title_content_2= regex.sub('', title_content_2).strip() # remove non letter and digits from string: https://stackoverflow.com/questions/22520932/python-remove-all-non-alphabet-chars-from-string
#print("DEBUG title_content_1:",title_content_1)
#print("DEBUG title_content_2:",title_content_2)
title_content_1_word_list = list(title_content_1.split(" "))
title_content_2_word_list = list(title_content_2.split(" "))
title_content_1_word_count= len(title_content_1_word_list)
title_content_2_word_count= len(title_content_2_word_list)
title_content = title_content_1
if title_content_1_word_count < 6:
#print("DEBUG title_content_1_word_count:",title_content_1_word_count)
#print("DEBUG title_content_2_word_count:",title_content_2_word_count)
max_additional_words= max_canonical_words - title_content_1_word_count
#print("DEBUG max_additional_words:",max_additional_words)
x_zero = int(0)
for x in title_content_2_word_list:
add_word = str("")
if x_zero < max_additional_words: # take at most 5 words from the website title for the canonical URL
#try:
add_word_tmp= str(title_content_2_word_list[x_zero]) # str(title_split[1]).strip()
#print("DEBUG add_word_tmp:",add_word_tmp)
t_split_more= ''.join(c for c in add_word_tmp if not c.isalpha() or not c.isspace() or not c.isdigit()).strip() # The isalpha() method returns True if all the characters are alphabet letters (a-z).
add_word = regex.sub('', t_split_more).strip() # remove non letter and digits from string: https://stackoverflow.com/questions/22520932/python-remove-all-non-alphabet-chars-from-string
#except Exception as a1:
# pass
title_content += str(" ")+str(add_word)
x_zero += int(1)
#print("DEBUG title_content 2:",title_content)
if len(title_content) < 1:
# simply take the first 5 words
title_split = title.split(" ")
title_content = alternative_url
try:
# collapse the repeated per-word cleanup into a single loop over at most max_canonical_words words
n_words = min(len(title_split), max_canonical_words)
if n_words >= 1:
cleaned_words = list()
for i in range(n_words):
w = ''.join(c for c in title_split[i] if not c.isalpha() or not c.isspace() or not c.isdigit()).strip()
cleaned_words.append(regex.sub('', w).strip()) # remove non letter and digits from string: https://stackoverflow.com/questions/22520932/python-remove-all-non-alphabet-chars-from-string
title_content = str(" ".join(cleaned_words))
except Exception as as1:
return str(alternative_url)
title_content = title_content.strip()
title_content = str(title_content.lower())
#print("DEBUG title 1:",title_content)
title_content = title_content.translate(special_char_map)
#print("DEBUG title 2:",title_content)
title_content = ws_filter.sub(" ",title_content).strip()
title_content_final = str(unidecode(title_content).replace(" ","-"))
canonical_url = "https://www.artikelschreiber.com/texts/"+str(seo_lang_add[language])+"-"+title_content_final+".html"
return canonical_url
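# Hedged example (hypothetical title and fallback URL): at most max_canonical_words title words end up in the slug,
# prefixed with the language-specific word from seo_lang_add.
# self.generateCanonicalURL("Grüne Smoothies: Rezepte und Tipps für Anfänger", "https://www.artikelschreiber.com/", "de")
# -> something like "https://www.artikelschreiber.com/texts/artikel-gruene-smoothies-rezepte-und-tipps.html"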
def calculateRank(self, myAI_obj, url, session, ip_obj, serp_obj):
#telerN1 = {'mainkeyword':MainKeyword, 'subkeyword':SubKeywords, 'articleraw':articleRaw, 'articletext':b_text, 'score':score, 'sessionid':SessionID, 'summary':p_summary, 'sprachprofil':p_sprachprofil, 'headline':article_headline, 'description':p_description,'articleurl':p_articleurl, 'metakeys':p_metakeys, 'addontext_json':myAddonText_json, 'language':Language, 'aitext_json':p_aitext_json}
teler = dict()#{}
points = int(0)
if not isinstance(url, str):
teler['points'] = int(-10)
return teler
html = self.getWebpage(url)
if not isinstance(html, str):
teler['points'] = int(-10)
return teler
social_found = False
imp_found = False
title = str("")
description = str("")
video = str("")
language = str("")
h_headline = str("")
h_text = str("")
h_text_ai = str("")
h_text_till_end = str("")
teler.update(ip_obj) # write the IP information into the current teler object
teler.update(serp_obj) # if SERP support is active, also enable this:
teler['isGoodContent'] = False
teler['openai'] = str("")
teler['fastmode'] = False
#teler['ip'] = str(ip_address)
teler['url'] = str(url)
teler['html'] = str(html)
teler['session'] = str(session)
teler["topics"] = str("")
teler["detected_language"] = str("")
teler["language"] = str("")
teler["summary"] = str("")
teler["title"] = str("")
teler['title_raw'] = str("")
teler["article_text"] = str("")
teler["article_text_ai"] = str("")
teler["article_text_ai_plagscore"] = int(0)
teler["article_text_low_quality_score"] = int(0)
teler['suggestions'] = list()
teler['canonical'] = str("https://www.artikelschreiber.com/")
teler['points'] = int(0)
teler['ai_enabled'] = False
teler["article_text_ai_raw"] = str("")
teler["openai_raw"] = str("")
teler["h1_text_ai_raw"] = str("")
teler["h2_text_ai_raw"] = str("")
teler["h3_text_ai_raw"] = str("")
teler['alternative_url'] = str("https://www.artikelschreiber.com/")
# if the HTML is strictly HTML5 compliant
if self.validateHTML(html):
points += int(1)
# short URL
if len(url) < 85:
points += int(1)
#### HTML source code > 200KB: -5p
if len(html) < 200000:
points += int(1)
try:
#soup = BeautifulSoup(html, "html5lib") #"lxml") # pip install -U html5lib # https://zetcode.com/python/beautifulsoup/
soup = BeautifulSoup(html, "lxml")
except Exception as as1:
soup = BeautifulSoup(html, "html.parser")
for tag in soup():
# CSS to remove attributes -> https://itecnote.com/tecnote/python-remove-all-inline-styles-using-beautifulsoup/
for attribute in ["class", "id", "style"]:
del tag[attribute]
for data in soup(['style', 'script', 'comment']):
# Remove tags -> https://www.geeksforgeeks.org/remove-all-style-scripts-and-html-tags-using-beautifulsoup/
data.decompose()
for headline in self.headlines_list:
resultLists = self.headline_extractor(headline, soup)
if len(resultLists) > 0:
for element in resultLists:
try:
h_headline = element.get(headline)
h_text = element.get(headline+"_text")
h_text_ai = element.get(headline+"_text_ai")
h_text_low_quality_score = element.get(headline+"_text_low_quality_score")
h_text_till_end_low_quality_score = element.get(headline+"_text_till_end_low_quality_score")
teler[headline] = h_headline
teler[headline+"_text"] = h_text
teler[headline+"_text_low_quality_score"] = h_text_low_quality_score
teler[headline+"_text_ai"] = h_text_ai
teler[headline+"_text_till_end"] = str("")
teler[headline+"_text_till_end_low_quality_score"] = h_text_till_end_low_quality_score
if (self.isFound(headline,"h1") or self.isFound(headline,"h2") or self.isFound(headline,"h3")) and len(h_headline) >= self.headline_min_length and h_text_low_quality_score == 0:
# bei H1,H2,H3 gibt es Bonuspunkte, wenn Headline lang genug ist und der Text Quality Score stimmt
points += int(1)
elif (self.isFound(headline,"h1") or self.isFound(headline,"h2") or self.isFound(headline,"h3")) and len(h_headline) < self.headline_min_length:
# bei H1,H2,H3 gibt es Punktabzug, wenn die Headline zu kurz ist
points -= int(1)
elif (self.isFound(headline,"h4") or self.isFound(headline,"h5") or self.isFound(headline,"h6")) and len(h_headline) >= self.headline_min_length and h_text_low_quality_score == 0:
# bei H4,H5,H6 gibt es Bonuspunkte, wenn Headline lang genung ist und der Text Quality Score stimmt
points += int(1)
elif (self.isFound(headline,"h4") or self.isFound(headline,"h5") or self.isFound(headline,"h6")) and len(h_headline) < self.headline_min_length:
# bei bei H4,H5,H6 gibt es Punktabzug, wenn die Headline zu kurz ist
points -= int(1)
elif (self.isFound(headline,"h1") or self.isFound(headline,"h2") or self.isFound(headline,"h3")) and count_spaces(h_text) <= self.max_spaces_count and count_words_regex(h_text) >= self.min_word_count and len(h_text) >= self.min_text_length and h_text_low_quality_score == 0:
# bei H1,H2,H3 gibt es Bonuspunkte, wenn alles OK ist
points += int(1)
elif not (self.isFound(headline,"h1") or self.isFound(headline,"h2") or self.isFound(headline,"h3")) and count_spaces(h_text) <= self.max_spaces_count and count_words_regex(h_text) >= self.min_word_count and len(h_text) >= self.min_text_length and h_text_low_quality_score == 0:
# bei H4,H5,H6 gibt es Punktabzug, wenn etwas falsch ist
points -= int(1)
elif (self.isFound(headline,"h4") or self.isFound(headline,"h5") or self.isFound(headline,"h6")) and count_spaces(h_text) <= self.max_spaces_count and count_words_regex(h_text) >= self.min_word_count and len(h_text) >= self.min_text_length and h_text_low_quality_score == 0:
# bei H4,H5,H6 gibt es Bonuspunkte, wenn alles OK ist
points += int(1)
elif not (self.isFound(headline,"h4") or self.isFound(headline,"h5") or self.isFound(headline,"h6")) and count_spaces(h_text) <= self.max_spaces_count and count_words_regex(h_text) >= self.min_word_count and len(h_text) >= self.min_text_length and h_text_low_quality_score == 0:
# bei bei H1,H2,H3 gibt es Punktabzug, wenn etwas falsch ist
points -= int(1)
except Exception as as1:
pass
###
# the actual H-heading content blocks down to the end of the whole text
###
try:
teler[headline+"_text_till_end"] = self.headline_to_html_end_extractor(headline, h_headline, soup)
h_text_till_end = teler.get(headline+"_text_till_end")
h_text_till_end_low_quality_score = self.isLowQualityContent(h_text_till_end)
teler[headline+"_text_till_end_low_quality_score"] = h_text_till_end_low_quality_score
if self.count_spaces(h_text_till_end) <= self.max_spaces_count and self.count_words_regex(h_text_till_end) >= self.min_word_count and len(h_text_till_end) >= self.min_text_length and h_text_till_end_low_quality_score == 0:
points += int(1) # h_text_till_end
else:
points -= int(1)
except Exception as as1:
pass
try:
#h = soup.title.text.strip() # soup.select('title')[0].text.strip() # print(soup.title)
h = self.get_title(soup)
title = str(self.preprocess_text(h))
if len(title) >= self.headline_min_length:
teler['title_raw'] = str(h)
teler['title'] = self.generateBeautifulTitle(title)
points += int(1)
else:
teler['title'] = str("") # leer setzen wenn kein Inhalt
except Exception as a1:
teler['title'] = str("") # leer setzen wenn kein Inhalt
pass
try:
h = self.get_description(soup)
description = str(self.preprocess_text(h))
if len(description) >= 100 and len(description) < 154:
teler['description'] = description
points += int(1)
else:
teler['description'] = str("") # leer setzen wenn kein Inhalt
except Exception as a1:
teler['description'] = str("") # leer setzen wenn kein Inhalt
pass
try:
h = self.get_image(soup, url)
image = str(self.preprocess_text(h))
if len(image) >= self.headline_min_length:
teler['image'] = image
points += int(1)
else:
teler['image'] = str("") # leer setzen wenn kein Inhalt
except Exception as a1:
teler['image'] = str("") # leer setzen wenn kein Inhalt
pass
try:
h = self.get_video(soup)
video = str(self.preprocess_text(h))
if len(video) >= self.headline_min_length:
teler['video'] = video
points += int(1)
else:
teler['video'] = str("") # leer setzen wenn kein Inhalt
except Exception as a1:
teler['video'] = str("") # leer setzen wenn kein Inhalt
pass
try:
og_language = self.get_og_locale(soup)
#og_language = str(self.preprocess_text(h))
og_language_len = len(og_language)
if og_language_len >= 2 and og_language_len < self.headline_min_length:
teler['language'] = str("")
teler['language'] = og_language
points += int(1)
else:
teler['language'] = str("")
except Exception as a1:
teler['language'] = str("")
pass
try:
h = self.get_links(soup, url)
if len(h) > 0:
teler['links'] = h
else:
teler['links'] = list()
except Exception as a1:
teler['links'] = list()
pass
try:
h = self.get_meta_keywords(soup, html, url)
h1 = h.replace(", ",";").strip()
h1 = h1.replace(" ,",";").strip()
h1 = h1.replace(",",";").strip()
if len(h1) > 0:
teler['keywords'] = h1
points += int(1)
elif len(h) > 0:
teler['keywords'] = h
points += int(1)
else:
teler['keywords'] = str("") # leer setzen wenn kein Inhalt
except Exception as a1:
pass
###
# find social links
###
teler['socials'] = False
for socials in social_links:
if self.isFound(html, socials):
teler['socials'] = True
points += int(1)
break
###
# find important keywords (e.g. "Impressum")
###
teler['important'] = False
for imp1 in imp_keywords:
if self.isFound(html, imp1):
teler['important'] = True
points += int(1)
break
try:
article_text = self.extractTextFromHTML(html)
article_processed = str(self.preprocess_text(article_text))
article_score = self.isLowQualityContent(article_processed)
"""
h1_text = teler.get("h1_text")
h1_score = teler.get("h1_low_quality_score")
h2_text = teler.get("h2_text")
h2_score = teler.get("h2_low_quality_score")
h3_text = teler.get("h3_text")
h3_score = teler.get("h3_low_quality_score")
myText = str("")
if article_score == 0:
myText += article_text
if h1_score == 0:
myText += h1_text
if h2_score == 0:
myText += h2_text
if h3_score == 0:
myText += h3_text
#if len(myText) < self.min_text_length:
#myText = article_text+" "+h1_text+" "+h2_text+" "+h3_text
article_processed = str(self.preprocess_text(article_text))
mylow_quality_score = self.isLowQualityContent(article_processed)
# the complete article is in here - fallback
"""
teler["article_text"] = article_processed
teler["article_text_low_quality_score"] = mylow_quality_score
if self.count_spaces(article_processed) <= self.max_spaces_count and self.count_words_regex(article_processed) >= self.min_word_count and len(article_processed) >= self.min_text_length and mylow_quality_score == 0:
points += int(1)
else:
points -= int(1)
except Exception as a1:
pass
try:
myText = str("")
article_text = teler.get("article_text")
article_score = teler.get("article_text_low_quality_score")
h1_text = teler.get("h1_text")
h1_score = teler.get("h1_low_quality_score")
h2_text = teler.get("h2_text")
h2_score = teler.get("h2_low_quality_score")
h3_text = teler.get("h3_text")
h3_score = teler.get("h3_low_quality_score")
if article_score == 0:
myText += article_text
if h1_score == 0:
myText += h1_text
if h2_score == 0:
myText += h2_text
if h3_score == 0:
myText += h3_text
if len(myText) < self.min_text_length:
myText = article_text+" "+h1_text+" "+h2_text+" "+h3_text
teler["summary"] = self.doLsaSummarizer(myText)
summary_text = teler.get("summary")
if self.count_spaces(summary_text) <= self.max_spaces_count and self.count_words_regex(summary_text) >= self.min_word_count and len(summary_text) >= self.min_text_length and article_score == 0:
points += int(1)
else:
points -= int(1)
except Exception as a1:
pass
try:
myText = str("")
article_text = teler.get("article_text")
article_score = teler.get("article_text_low_quality_score")
h1_text = teler.get("h1_text")
h1_score = teler.get("h1_low_quality_score")
h2_text = teler.get("h2_text")
h2_score = teler.get("h2_low_quality_score")
h3_text = teler.get("h3_text")
h3_score = teler.get("h3_low_quality_score")
if article_score == 0:
myText += article_text
if h1_score == 0:
myText += h1_text
if h2_score == 0:
myText += h2_text
if h3_score == 0:
myText += h3_text
if len(myText) < self.min_text_length:
myText = article_text+" "+h1_text+" "+h2_text+" "+h3_text
teler["detected_language"] = self.detectTextLanguage(myText)
except Exception as a1:
teler["detected_language"] = str("")
pass
try:
myText = str("")
article_text = teler.get("article_text")
article_score = teler.get("article_text_low_quality_score")
h1_text = teler.get("h1_text")
h1_score = teler.get("h1_low_quality_score")
h2_text = teler.get("h2_text")
h2_score = teler.get("h2_low_quality_score")
h3_text = teler.get("h3_text")
h3_score = teler.get("h3_low_quality_score")
if article_score == 0:
myText += article_text
if h1_score == 0:
myText += h1_text
if h2_score == 0:
myText += h2_text
if h3_score == 0:
myText += h3_text
if len(myText) < self.min_text_length:
myText = article_text+" "+h1_text+" "+h2_text+" "+h3_text
og_language = teler.get("language")
detected_language = teler.get("detected_language")
if len(og_language) == 2:
teler["topics"] = myAI_obj.topicModeling(myText, og_language)
else:
teler["topics"] = myAI_obj.topicModeling(myText, detected_language)
article_topics = teler.get("topics")
#pprint(article_topics)
if len(article_topics) >= self.headline_min_length*10: # 70 characters
points += int(1)
else:
points -= int(1)
except Exception as a1:
teler["topics"] = str("")
pass
try:
suggestions = list()
nodouble = set()
article_keywords = teler.get("keywords")
article_topics = teler.get("topics")
if len(article_keywords) >= self.headline_min_length-4: # i.e. at least 3 characters long
topics = article_keywords.split(';')
for word_text in topics:
w_len = len(word_text)
if word_text.strip() not in nodouble and w_len > 3:
suggestions.append("mk="+word_text.strip())
nodouble.add(word_text.strip())
if len(article_topics) >= self.headline_min_length-4:
topics = article_topics.split(';')
for word_text in topics:
w_len = len(word_text)
if word_text.strip() not in nodouble and w_len > 3:
suggestions.append("mk="+word_text.strip())
nodouble.add(word_text.strip())
teler['suggestions'] = list(suggestions) # important because sets cannot be JSON-serialized
except Exception as a1:
pass
try:
#alternative_url "canonical fallback link" Link:
topo = teler.get("topics")
sessionShort = str(session[:3]).lower()
topic1 = list(topo.lower().split(";"))
if len(topic1) >= 3:
topic = str(self.remove_accents(topic1[0]))
rand = str(self.remove_accents(topic1[1]))
rand2 = str(self.remove_accents(topic1[2]))
#points += int(1)
elif len(topic1) >= 2:
topic = str(self.remove_accents(topic1[0]))
rand = str(self.remove_accents(topic1[1]))
rand2 = str(self.remove_accents(topic1[1]))
#points += int(1)
elif len(topic1) >= 1:
topic = str(self.remove_accents(topic1[0]))
rand = str(self.remove_accents(topic1[0]))
rand2 = str(self.remove_accents(topic1[0]))
#points -= int(1)
else:
topic = str(self.remove_accents("blog"))
rand = str(self.remove_accents("entry"))
rand2 = str(self.remove_accents("id"))
#points -= int(1)
og_language = teler.get("language")
detected_language = teler.get("detected_language")
if len(og_language) == 2:
language = og_language
else:
language = detected_language
# old: filename = str(topic+"-"+rand+"-"+rand2+"-"+sessionShort).lower()+".html"
filename = str("article-"+topic+"-"+rand+"-"+str(language).lower()+"-"+sessionShort).lower()+".html"
# old: absolute_canonical = "https://www.artikelschreiber.com/"+str(language)+"/blog/"+filename
teler['alternative_url'] = "https://www.artikelschreiber.com/texts/"+filename
except Exception as a1:
teler['alternative_url'] = str("https://www.artikelschreiber.com/")
pass
try:
#canonical Link:
alternative_url = teler.get("alternative_url")
og_language = teler.get("language")
detected_language = teler.get("detected_language")
if len(og_language) == 2:
language = og_language
else:
language = detected_language
myTmpTitle = teler.get("title")
teler['canonical'] = self.generateCanonicalURL(myTmpTitle, alternative_url, language)
points += int(2)
except Exception as a1:
teler['canonical'] = alternative_url
points += int(1)
pass
try: # content-to-source-code ratio between 40 and 70 percent?
article_text = teler.get("article_text")
if len(article_text)/210000 >= 0.4 and len(article_text)/210000 < 0.71:
points += int(1)
except Exception as as1:
pass
try: #### HTML has Google or Bing Ads: +5p
if html.find("pagead2.googlesyndication.com/pagead/js/adsbygoogle.js") != -1:
points += int(1)
except Exception as as1:
pass
try: #### structured data present in the HTML: +5p
if html.find("application/ld+json") != -1:
points += int(1)
except Exception as as1:
pass
try: # if the text contains too many characters that are neither letters nor digits
article_text = teler.get("article_text")
bchar = self.countBadChars(article_text)
ltext = len(article_text)
bcharScore = float("{0:.3f}".format((bchar/ltext)*100))
if bcharScore >= 3.2:
points -= int(1)
except Exception as as1:
pass
try: # if the page is behind a paywall
if self.isPaywall(html):
points -= int(10)
except Exception as as1:
pass
try: #### if <ul>/<ol> lists are in the html then: +5; first find the list content, then read out the <li> elements there separately
ul = re.findall(r"<ul[^>]*>(.*?)</ul>", html, re.IGNORECASE | re.DOTALL)
ol = re.findall(r"<ol[^>]*>(.*?)</ol>", html, re.IGNORECASE | re.DOTALL)
for t in list(ul + ol):
if self.isFound(t, "<li"):
points += int(1)
break
except Exception as as1:
pass
"""
#TODO### short sentences of 9 to 13 words: +5p
"""
# Session: b459d00a59a0aca81c7415947e03746f-ARTIKELSCHREIBER_de.json contains a great many misspelled/garbled characters; junk like that has to be filtered out and given a rank of 0!
content_text_special_chars = str(teler.get("h1_text") or "")+str(teler.get("h2_text") or "")+str(teler.get("h3_text") or "")+str(teler.get("h4_text") or "")+str(teler.get("h5_text") or "")+str(teler.get("article_text") or "") # guard against heading levels that were never set
count_broken_chars = self.count_broken_character(content_text_special_chars)
isLowQualityContent = self.isLowQualityContentText(content_text_special_chars)
if count_broken_chars > 20 or isLowQualityContent:
teler['points'] = int(0)
else:
teler['points'] = points
return teler