""" Copyright (c) 2023, Sebastian Enger, M.Sc. All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree (BSD-4-Clause). Frontend and Backend Source Code for Project: - https://www.artikelschreiber.com/ - https://www.artikelschreiben.com/ - https://www.unaique.net/ """ import logging logging.getLogger().disabled = True logging.disable(logging.WARNING) logging.disable(logging.INFO) import urllib.parse import sys sys.path.append('/home/unaique/library3') #sys.path.append('/home/unaique/library') import ssl ssl._create_default_https_context = ssl._create_unverified_context """ Fix SSL Problems: # https://gankrin.org/how-to-fix-ssl-certificate_verify_failed-error-error-in-python/ $ sudo update-ca-certificates --fresh $ export SSL_CERT_DIR=/etc/ssl/certs """ # pip3 install -U aiohttp asyncio # https://www.twilio.com/blog/-asynchrone-http-anforderungen-python-aiohttp-asyncio #import aiohttp #import asyncio import requests # pip3 install --upgrade requests import bleach #https://pypi.org/project/bleach/ from dns import resolver,reversename import json import codecs import string from string import printable from unidecode import unidecode import re import random import os import socket from bs4 import BeautifulSoup, Comment import extruct # https://github.com/scrapinghub/extruct /// pip install -U extruct from w3lib.html import get_base_url # https://github.com/scrapy/w3lib /// pip install -U w3lib import regex as re1 # pip3 install -U regex from urllib.parse import urljoin from urllib.parse import urlencode from urllib.parse import urlparse import urllib3 urllib3.disable_warnings() from newspaper import Article #import newspaper #pip3 install -U newspaper3k article = Article('') # compile regexes, so we don't do this on the fly and rely on caching #from typing import Any, Dict, Pattern from typing import Pattern from langdetect import detect # also https://github.com/saffsd/langid.py - pip3 install -U langdetect import langid # pip3 install -U langid from langua import Predict # pip3 install -U langua https://github.com/whiletruelearn/langua import pycld2 as cld2 # pip install -U pycld2 #from sumy.parsers.html import HtmlParser from sumy.parsers.plaintext import PlaintextParser # pip3 install -U sumy from sumy.nlp.tokenizers import Tokenizer from sumy.summarizers.lsa import LsaSummarizer as Summarizer from sumy.summarizers.reduction import ReductionSummarizer from sumy.nlp.stemmers import Stemmer from sumy.utils import get_stop_words from html import unescape import geoip2.database geoReader = geoip2.database.Reader("/home/unaique/library3/geoip/GeoLite2-City.mmdb") # https://github.com/P3TERX/GeoLite.mmdb - Downloads #https://geoip2.readthedocs.io/en/latest/ # pip3 install -U geoip2 # https://download.maxmind.com/app/geoip_download?edition_id=GeoLite2-City&license_key=############&suffix=tar.gz # Account/User ID ########### # License key ################ # pip3 install -U IP2Location #https://www.ip2location.com/development-libraries/ip2location/python #https://github.com/chrislim2888/IP2Location-Python # Web: https://lite.ip2location.com/ - User: Sebastian@rechthaben.net - Pass: zVkXkqH6v+8SG@$ # Direct Download: https://www.ip2location.com/download/?token=##############&file=DB11LITEBIN # Free Download: Sources: https://download.ip2location.com/lite/ import IP2Location database = IP2Location.IP2Location(os.path.join("data", "/home/unaique/library3/geoip/IP2LOCATION-LITE-DB11.BIN")) import unicodedata # 
import psutil  # pip3 install -U psutil  https://www.geeksforgeeks.org/how-to-get-current-cpu-and-ram-usage-in-python/
import datetime as dt
from html5validate import validate  # pip3 install -U html5lib
from lxml import etree
from io import StringIO

# https://hackersandslackers.com/scrape-metadata-json-ld/
# https://hackersandslackers.com/scraping-urls-with-beautifulsoup/
# https://github.com/hackersandslackers/beautifulsoup-tutorial
# https://github.com/practical-data-science/ecommercetools
# https://github.com/mozilla/bleach
# next and previous tags: https://stackoverflow.com/questions/53437616/beautifulsoup-find-elements-directly-below-and-above-heading-with-specific-str

"""
from AIlify import AI
myAI = AI()
#print(type(myAI))
#sys.exit(1)
"""

"""
# Start: global variables for Rankify
headline_min_length = 7    # chars
min_word_count = 7         # words
min_text_length = 140      # chars
max_spaces_count = 5       # ' '
headlines_list = ["h1", "h2", "h3", "h4", "h5", "h6"]
# End: global variables for Rankify
"""

VALUESERP_KEY = '########################'
punct = set(string.punctuation)

UserAgent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
UserAgentMobile = "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_2 like Mac OS X) AppleWebKit/603.2.4 (KHTML, like Gecko) FxiOS/7.5b3349 Mobile/14F89 Safari/603.2.4"  # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
Headers = {'user-agent': UserAgent, 'Content-Type': 'text/html; charset=utf-8', 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate'}
#HeadersSimple = {'user-agent': UserAgentMobile, 'Content-Type': 'text/html; charset=utf-8', 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate'}
HeadersSimple = {'user-agent': UserAgentMobile, 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate'}
HeadersSimpleADV = {'user-agent': UserAgentMobile, 'Authorization': 'ce78143f444846d14d338f0da26a2434', 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate'}

stoplist_domains = "/home/unaique/library3/blacklists/adblock.txt"  # Source: https://github.com/carlospolop/MalwareWorld and https://malwareworld.com/textlists/suspiciousDomains.txt
stoplist_shops = "/home/unaique/library3/blacklists/shops.txt"
stoplist_paywall = "/home/unaique/library3/blacklists/paywall.txt"
stoplist_porn = "/home/unaique/library3/blacklists/pornstopwordlist.txt"
stoplist_captcha = "/home/unaique/library3/blacklists/captcha.txt"

f_domains = open(stoplist_domains, "r", encoding='utf-8')
f_shops = open(stoplist_shops, "r", encoding='utf-8')
f_paywall = open(stoplist_paywall, "r", encoding='utf-8')
f_porn = open(stoplist_porn, "r", encoding='utf-8')
f_captcha = open(stoplist_captcha, "r", encoding='utf-8')

"""
stoplist_domains_blocker = list(set(f_domains.readlines()))
stoplist_shops_blocker = list(set(f_shops.readlines()))
stoplist_paywall_blocker = list(set(f_paywall.readlines()))
stoplist_porn_blocker = list(set(f_porn.readlines()))
stoplist_captcha_blocker = list(set(f_captcha.readlines()))
"""

RE_LINEBREAK: Pattern = re.compile(r"(\r\n|[\n\v])+")
RE_NONBREAKING_SPACE: Pattern = re.compile(r"[^\S\n\v]+")
RE_ZWSP: Pattern = re.compile(r"[\u200B\u2060\uFEFF]+")
RE_BRACKETS_CURLY = re.compile(r"\{[^{}]*?\}")
RE_BRACKETS_ROUND = re.compile(r"\([^()]*?\)")
RE_BRACKETS_SQUARE = re.compile(r"\[[^\[\]]*?\]")
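# --- Illustrative sketch (not part of the original module) --------------------
# How the compiled patterns above are typically chained to normalise scraped
# text (zero-width characters dropped, line-break runs collapsed to "\n", other
# whitespace runs to a single space, bracketed asides removed). A minimal
# sketch, not the module's own cleaner:
def _sketch_normalize_whitespace(text):
    text = RE_ZWSP.sub("", text)                 # strip zero-width spaces / BOM
    text = RE_LINEBREAK.sub("\n", text)          # collapse runs of line breaks
    text = RE_NONBREAKING_SPACE.sub(" ", text)   # collapse other whitespace runs
    text = RE_BRACKETS_ROUND.sub("", text)       # drop "(...)" asides
    text = RE_BRACKETS_SQUARE.sub("", text)      # drop "[...]" asides
    return text.strip()
# Example: _sketch_normalize_whitespace("Hello\u200b  world (aside)\r\n\r\nBye")
#          -> "Hello world \nBye"
# -------------------------------------------------------------------------------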
r"([\u2022\u2023\u2043\u204C\u204D\u2219\u25aa\u25CF\u25E6\u29BE\u29BF\u30fb])", ) # source: https://gist.github.com/dperini/729294 RE_URL: Pattern = re.compile( r"(?:^|(?= 224.0.0.0 # excludes network & broadcast addresses # (first & last IP address of each class) r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" r"|" # host name r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)" # domain name r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*" # TLD identifier r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))" r")" # port number r"(?::\d{2,5})?" # resource path r"(?:/\S*)?" r"(?:$|(?![\w?!+&/]))", flags=re.IGNORECASE, ) RE_SHORT_URL: Pattern = re.compile( r"(?:^|(?= 1: print("String contains {} Special Character/s ".format(special_char)) else: print("There are no Special Characters in this String.") """ return special_char def doSerpSearch(self, MainKeyword, SubKeywords, language, ip_obj): search_query = str(MainKeyword+", "+SubKeywords) isYoutubeFlag = False gl_google_country = { # https://www.valueserp.com/docs/search-api/reference/google-countries # ascom internal language shortcode: google country equivalent "de": "de", "en": "us", "es": "es", "fr": "fr", "it": "it", "ru": "ru", "cn": "cn", "zh": "cn", "jp": "jp", "ja": "jp", "pt": "pt", "in": "in", "hi": "in", "sa": "sa", "ar": "sa", "tr": "tr", } gl_google_language = { # https://www.valueserp.com/docs/search-api/reference/google-countries # ascom internal language shortcode: google country equivalent "de": "lang_de", "en": "lang_en", "es": "lang_es", "fr": "lang_fr", "it": "lang_it", "ru": "lang_ru", "cn": "lang_zh-tw", "zh": "lang_zh-tw", "jp": "lang_ja", "ja": "lang_ja", "pt": "lang_pt", "in": "lang_en", "hi": "lang_en", "sa": "lang_ar", "ar": "lang_ar", "tr": "lang_tr", } """ The gl parameter determines the Google country to use for the query. View the full list of supported glvalues https://www.valueserp.com/docs/search-api/reference/google-countries. Defaults to us. The hl parameter determines the Google UI language to return results. View the full list of supported hlvalues https://www.valueserp.com/docs/search-api/reference/google-languages. Defaults to en. """ params = { 'api_key': VALUESERP_KEY, 'flatten_results': True, #'include_answer_box': True, # flatten_results und include_answer_box funktionieren beide nicht zusammen #'include_advertiser_info': True, # dies kostet 1 Creditpunkt mehr! 
somit 2 statt nur 1 beim weglassen dieser include_advertiser_info option 'q': search_query, 'gl': gl_google_country[language], 'hl': 'de', 'num': '75', #'100', 'location': "lat:"+ip_obj.get("ip_latitude")+",lon:"+ip_obj.get("ip_longitude"), 'safe': 'active',# or enable safe search: 'active', disable safe mode: "off" 'lr': gl_google_language[language] # https://www.valueserp.com/docs/search-api/reference/google-lr-languages } if isYoutubeFlag: params['search_type'] = 'videos' myteler = dict() youtube_videos = list() related_searches = list() related_questions = list() organic_results = list() answer_box = list() isSuccess = False api_result_content = str("") try: api_result = requests.get('https://api.valueserp.com/search', params, timeout=300) if api_result.status_code == 200: api_result_content = api_result.json() isSuccess = api_result_content.get("request_info", False).get("success", False) except Exception as as1: pass """ r = codecs.open("/home/unaique/library3/ARCHIV/valueserp.json", 'r', encoding='utf-8') api_result_content = json.loads(r.read()) isSuccess = True r.close() """ myteler["serp"] = api_result_content # ist ein dict, muss mit print(json.dumps(myteler["serp"], indent=2)) zu json umgewandelt werden if not isSuccess: myteler["isSuccess"] = False myteler["serp_youtube_videos"] = list() myteler["serp_related_searches"] = list() myteler["serp_faq_related_questions"] = list() myteler["serp_organic_results"] = list() myteler["serp_mainkeyword"] = str(MainKeyword) myteler["serp_subkeywords"] = str(SubKeywords) return myteler elif isSuccess: # All options here: https://www.valueserp.com/docs/search-api/results/google/search myteler["isSuccess"] = True if isYoutubeFlag: inline_videos_objs = api_result_content.get("video_results","") for iv_obj in inline_videos_objs: yt = dict() yt_title = str(iv_obj.get("title","")) yt_link = str(iv_obj.get("link","")) #yt_date = str(iv_obj.get("date","")) yt_date_utc = str(iv_obj.get("date_utc","")) yt_snippet = str(iv_obj.get("snippet","")) yt_image = str(iv_obj.get("image","")) yt_length = str(iv_obj.get("length","")) if len(yt_title) > 3 and len(yt_link) > 3 and yt_link.find("https://www.youtube.com/") != -1: yt["youtube_video"] = yt_link yt["youtube_title"] = yt_title yt["youtube_date_utc"] = yt_date_utc yt["youtube_snippet"] = yt_snippet yt["youtube_length"] = yt_length yt["youtube_image"] = yt_image youtube_videos.append(yt) return youtube_videos else: related_searches_objs = api_result_content.get("related_searches","") for rs_obj in related_searches_objs: rs = dict() rs_title = str(rs_obj.get("query","")) if len(rs_title) > 3: rs["related_searches"] = rs_title related_searches.append(rs) related_questions_objs = api_result_content.get("related_questions","") for rq_obj in related_questions_objs: rq = dict() rq_source_link = str("") rq_source_title = str("") try: rq_source_link = str(rq_obj.get("source","").get("link","")) rq_source_title = str(rq_obj.get("source","").get("title","")) except Exception as as1: pass rq_question = str(rq_obj.get("question","")) rq_answer = str(rq_obj.get("answer","")) if len(rq_question) > 3 and len(rq_answer) > 3 and len(rq_source_title) > 3 and len(rq_source_link) > 3: rq["faq_question"] = rq_question rq["faq_answer"] = rq_answer rq["faq_source_link"] = rq_source_link rq["faq_source_title"] = rq_source_title related_questions.append(rq) organic_results_objs = api_result_content.get("organic_results","") for or_obj in organic_results_objs: or_d = dict() or_position = str(or_obj.get("position","")) or_title = 
str(or_obj.get("title","")) or_link = str(or_obj.get("link","")) or_snippet = str(or_obj.get("snippet","")) or_cached_page_link = str(or_obj.get("cached_page_link","")) # https://webcache.googleusercontent.com/search?q=cache:or_4V5fHFHkJ:https://www.gruender.de/vertrieb/amazon-seo/&cd=12&hl=de&ct=clnk&gl=de&lr=lang_de&vwsrc=1 or_about = str("") try: or_about = str(or_obj.get("about_this_result","").get("your_search_and_this_result","")) except Exception as as1: pass or_date = str(or_obj.get("date","")) or_date_utc = str(or_obj.get("date_utc","")) or_rich_snippet_rating = str("") or_rich_snippet_reviews = str("") try: or_rich_snippet_rating = str(or_obj.get("rich_snippet","").get("top","").get("detected_extensions","").get("rating",int(0))) or_rich_snippet_reviews = str(or_obj.get("rich_snippet","").get("top","").get("detected_extensions","").get("reviews",int(0))) except Exception as as1: pass if (len(or_position) >= 1 and len(or_title) > 3 and len(or_link) > 3 and len(or_snippet) > 3) or (len(or_title) > 3 and len(or_link) > 3 and or_link.find("youtube.com/") != -1 or or_link.find("youtu.be/") != -1or or_link.find("http://") != -1 or or_link.find("https://") != -1): isUrlBlacklisted = self.isDomainBlacklisted(or_link) if not isUrlBlacklisted: or_d["organic_results_position"] = or_position or_d["organic_results_title"] = or_title or_d["organic_results_link"] = or_link or_d["organic_results_snippet"] = or_snippet or_d["organic_results_cached_page"] = or_cached_page_link or_d["organic_results_about_list"] = or_about or_d["organic_results_date"] = or_date or_d["organic_results_date_utc"] = or_date_utc or_d["organic_results_rich_snippet_rating"] = or_rich_snippet_rating or_d["organic_results_rich_snippet_reviews"] = or_rich_snippet_reviews organic_results.append(or_d) inline_videos_objs = api_result_content.get("inline_videos","") for iv_obj in inline_videos_objs: yt = dict() yt_title = str(iv_obj.get("title","")) yt_link = str(iv_obj.get("link","")) yt_date_utc = str(iv_obj.get("date_utc","")) yt_snippet = str(iv_obj.get("snippet","")) yt_image = str(iv_obj.get("image","")) yt_length = str(iv_obj.get("length","")) # Source: https://gist.github.com/rodrigoborgesdeoliveira/987683cfbfcc8d800192da1e73adc486 if len(yt_title) > 3 and len(yt_link) > 3 and ((yt_link.find("youtube.com/") != -1 and yt_link.find("watch?v=") != -1) or yt_link.find("youtu.be") != -1 or yt_link.find("youtube.com/embed/") != -1 or yt_link.find("youtube-nocookie.com/embed/") != -1): yt["youtube_video"] = yt_link yt["youtube_title"] = yt_title yt["youtube_date_utc"] = yt_date_utc yt["youtube_snippet"] = yt_snippet yt["youtube_length"] = yt_length yt["youtube_image"] = yt_image youtube_videos.append(yt) if len(youtube_videos) < 1: for my_obj in organic_results: yt = dict() yt_link = str(my_obj.get("organic_results_link")) my_content_title = str(my_obj.get("organic_results_title")) if len(yt_link) > 3 and ((yt_link.find("youtube.com/") != -1 and yt_link.find("watch?v=") != -1) or yt_link.find("youtu.be") != -1 or yt_link.find("youtube.com/embed/") != -1 or yt_link.find("youtube-nocookie.com/embed/") != -1): yt["youtube_video"] = yt_link yt["youtube_title"] = my_content_title yt["youtube_date_utc"] = str("") yt["youtube_snippet"] = str("") yt["youtube_length"] = str("") yt["youtube_image"] = str("") youtube_videos.append(yt) myteler["serp_youtube_videos"] = youtube_videos myteler["serp_related_searches"] = related_searches myteler["serp_faq_related_questions"] = related_questions myteler["serp_organic_results"] = organic_results 
myteler["serp_mainkeyword"] = str(MainKeyword) myteler["serp_subkeywords"] = str(SubKeywords) return myteler def is_url(self, url): try: result = urlparse(url) return all([result.path]) except ValueError: return False def validateHTML(self, html): """ import sys # https://github.com/danthedeckie/html5validate sys.path.append('/home/unaique/library') from html5validate import validate #pip3 install -U html5lib from lxml import etree from io import StringIO """ try: validate(html) return True except Exception as e1: try: parser = lxml.etree.HTMLParser(recover=False) lxml.etree.parse(StringIO(html), parser) return True except Exception as e1: return False return False # Ende: Globale Variablen für Rankify def countBadChars(self, text): if len(text) < 1: return 0 text = str(text) #return sum(not c.isalnum() and not c.isspace() for c in text) return sum(c in string.punctuation for c in text) def useFastmode(self, text): # Load muss (!) zuerst, da er essentiell für den Server ist # kein Fastmode während der Entwicklungsphase isPorn = self.isPornBlacklisted(text) if isPorn: return True load1, load5, load15 = psutil.getloadavg() if load1 >= self.max_load_fastmode: return True hourOfDay = int(dt.datetime.now().hour) if hourOfDay >= 8 and hourOfDay <= 16:# Stoßzeiten von 8 Uhr früh bis 16 Uhr Nachmittags, dort geringere Wahrscheinlichkeit für AI random_number = random.randint(0, 100) else: random_number = random.randint(0, 75) if random_number == 1: # bei 2,3,4,5 also bei 1/5 fünftel der Anfragen dürfen wir KEIN KI nutzen, wir nehmen KI nur, wenn der Wert "int(1)" kommt return False # hier keine Else Schleife, weil wir noch den Load gucken können #return False # keine Fastmode, sprich wir dürfen KI Modul benutzen return True def remove_accents(self, input_str): nfkd_form = unicodedata.normalize('NFKD', input_str) only_ascii = nfkd_form.encode('ASCII', 'ignore') s_string = str(only_ascii.decode('ASCII', "ignore")) return str(s_string) def isIntent(self, text): informative_intent = ['what','who','when','where','which','why','how'] transactional_intent = ['buy','order','purchase','cheap','price','discount','shop','sale','offer'] commercial_intent = ['best','top','review','comparison','compare','vs','versus','guide','ultimate'] custom_intent = ['google','facebook','youtube','twitter','instagram','ebay','microsoft','apple','amazon','whatsapp'] # https://majestic.com/reports/majestic-million for element in informative_intent + transactional_intent + commercial_intent + custom_intent: if hasMatch(text, element): return True return False def getGeoInformationForLogging(self, ip): if len(ip) < 7: ip = "1.1.1.1" try: response = geoReader.city(ip) rec = database.get_all(ip) iso = str(response.country.iso_code) cn = str(response.country.name) ctn = str(response.city.name) pc = str(response.postal.code) lat = str(response.location.latitude) lon = str(response.location.longitude) region = str(response.subdivisions.most_specific.name) #return (iso+","+cn+","+ctn+","+pc+","+lat+","+lon) #return str(rec.country_short+","+rec.country_long+","+rec.region+","+rec.city+","+rec.zipcode+","+rec.latitude+","+rec.longitude+","+ip) cn1 = str() region1 = str() cn1 = rec.country_long region1 = response.city.name if cn1 is None: cn1 = str(response.country.name) else: cn1 = str(rec.country_long) if region1 is None: region1 = str(rec.region) else: region1 = str(response.city.name) return str(cn1+" -> "+region1) except Exception as a1: pass return str("iso"+","+"cn"+","+"region"+","+"ctn"+","+"pc"+","+"lat"+","+"lon"+","+"ip") def 
getGeoInformation(self, ip): if len(ip) < 7: ip = "1.1.1.1" try: response = geoReader.city(ip) iso = str(response.country.iso_code) cn = str(response.country.name) ctn = str(response.city.name) pc = str(response.postal.code) lat = str(response.location.latitude) lon = str(response.location.longitude) region = str(response.subdivisions.most_specific.name) #return (iso+","+cn+","+ctn+","+pc+","+lat+","+lon) return str(iso+","+cn+","+region+","+ctn+","+pc+","+lat+","+lon+","+ip) except Exception as a1: pass return str("iso"+","+"cn"+","+"region"+","+"ctn"+","+"pc"+","+"lat"+","+"lon"+","+"ip") def getGeoInformation2(self, ip): if len(ip) < 7: ip = "1.1.1.1" try: rec = database.get_all(ip) return str(rec.country_short+","+rec.country_long+","+rec.region+","+rec.city+","+rec.zipcode+","+rec.latitude+","+rec.longitude+","+ip) except Exception as as1: pass return str("iso"+","+"cn"+","+"region"+","+"ctn"+","+"pc"+","+"lat"+","+"lon"+","+"ip") def getGeoInformationAdvanced(self, ip): ip_obj = dict() iso = str("") cn = str("") ctn = str("") pc = str("") lat = str("48.364727") lon = str("10.900789") region = str("") iso2 = str("") cn2 = str("") ctn2 = str("") pc2 = str("") lat2 = str("") lon2 = str("") region2 = str("") my_hostname = str("") try: my_hostname = str(socket.gethostbyaddr(ip)[0]) except Exception as a1: pass if len(my_hostname) < 1: try: my_hostname = str(socket.getnameinfo((ip, 0), 0)[0]) except Exception as a1: pass if len(my_hostname) < 1: try: addr = reversename.from_address(ip) my_hostname = str(str(resolver.query(addr,"PTR")[0])) except Exception as a1: pass if len(my_hostname) < 1: my_hostname = str(ip) ip_obj["ip_iso_code"] = iso ip_obj["ip_city"] = ctn ip_obj["ip_region"] = region ip_obj["ip_country"] = cn ip_obj["ip_postalcode"] = pc ip_obj["ip_latitude"] = lat ip_obj["ip_longitude"] = lon ip_obj["ip_address"] = str(ip) ip_obj["ip_hostname"] = str(my_hostname) lat = str("") # Fallback, weil sonst in doSerpSearch() die LatLon Werte fehlen, wenn kein IP-to-Geo Auflösung funktioniert lon = str("") if len(ip) < 7: return ip_obj try: rec = database.get_all(ip) iso = str(rec.country_short) cn = str(rec.country_long) ctn = str(rec.city) pc = str(rec.zipcode) lat = str(rec.latitude) lon = str(rec.longitude) region = str(rec.region) #return str(rec.country_short+","+rec.country_long+","+rec.region+","+rec.city+","+rec.zipcode+","+rec.latitude+","+rec.longitude+","+ip) except Exception as as1: pass if len(iso) < 1 or len(cn) < 1 or len(region) < 1 or len(ctn) < 1 or len(pc) < 1 or len(lat) < 1 or len(lon) < 1: try: response = geoReader.city(ip) iso2 = str(response.country.iso_code) cn2 = str(response.country.name) ctn2 = str(response.city.name) pc2 = str(response.postal.code) lat2 = str(response.location.latitude) lon2 = str(response.location.longitude) region2 = str(response.subdivisions.most_specific.name) except Exception as a1: pass if len(iso) < 1: iso = iso2 if len(cn) < 1: cn = cn2 if len(region) < 1: region = region2 if len(ctn) < 1: ctn = ctn2 if len(pc) < 1: pc = pc2 if len(lat) < 1: lat = lat2 if len(lon) < 1: lon = lon2 ip_obj["ip_iso_code"] = iso ip_obj["ip_city"] = ctn ip_obj["ip_region"] = region ip_obj["ip_country"] = cn ip_obj["ip_postalcode"] = pc ip_obj["ip_latitude"] = lat ip_obj["ip_longitude"] = lon ip_obj["ip_address"] = str(ip) ip_obj["ip_hostname"] = str(my_hostname) return ip_obj def beautifyUpperLowercase(self, text): text1=self._beautifyUpperLowercase(text) text2=self._beautifyUpperLowercaseUmlauts(text1) return text2 def _beautifyUpperLowercase(self, 
strString): #https://pypi.python.org/pypi/regex/ #http://code.activestate.com/recipes/576984-split-a-string-on-capitalized-uppercase-char-using/ if not isinstance(strString, str): strString=strString.decode("utf8") #result = re.sub(u'(?<=[A-Za-z])(?=[A-Z][a-z])', '. ', strString, re.UNICODE) result = re.sub(u'(?<=[A-Za-z])(?=[A-Z][a-z])', ' ', strString, re.UNICODE) return result def _beautifyUpperLowercaseUmlauts(self, strString): if not isinstance(strString, str): strString=strString.decode("utf8") #result = re.sub(u'(?<=[a-z])(?=[ÜÖÄ])', '. ', strString, re.UNICODE) result = re.sub(u'(?<=[a-z])(?=[ÜÖÄ])', ' ', strString, re.UNICODE) return result def doLsaSummarizer(self, text): # pip3 install -U sumy # https://github.com/miso-belica/sumy/tree/main/sumy/data/stopwords Language = self.detectTextLanguage(text) if "en" in Language.lower() or "en" == Language.lower(): LANGUAGE = "english" elif "fr" in Language.lower() or "fr" == Language.lower(): LANGUAGE = "french" elif "es" in Language.lower() or "es" == Language.lower(): LANGUAGE = "spanish" elif "it" in Language.lower() or "it" == Language.lower(): LANGUAGE = "italian" elif "ru" in Language.lower() or "ru" == Language.lower(): LANGUAGE = "slovak" elif "zh" in Language.lower() or "zh" == Language.lower() or "cn" in Language.lower() or "cn" == Language.lower(): LANGUAGE = "chinese" elif "de" in Language.lower() or "de" == Language.lower(): LANGUAGE = "german" elif "pt" in Language.lower() or "pt" == Language.lower(): LANGUAGE = "portuguese" elif "jp" in Language.lower() or "jp" == Language.lower(): LANGUAGE = "japanese" elif "hi" in Language.lower() or "hi" == Language.lower() or "in" in Language.lower() or "in" == Language.lower(): LANGUAGE = "hindi" hindi = self.split_sentences(text) if len(hindi) >= 3: return str(" ".join(hindi[:3])) else: return str(" ".join(hindi)) elif "ar" in Language.lower() or "ar" == Language.lower() or "sa" in Language.lower() or "sa" == Language.lower(): LANGUAGE = "arabic" hindi = self.split_sentences(text) if len(hindi) >= 3: return str(" ".join(hindi[:3])) else: return str(" ".join(hindi)) else: LANGUAGE = "english" SENTENCES_COUNT = 3 text = self.beautifyUpperLowercase(text) parser = PlaintextParser.from_string(text,Tokenizer(LANGUAGE)) # or for plain text files # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE)) stemmer = Stemmer(LANGUAGE) summarizer = Summarizer(stemmer) summarizer.stop_words = get_stop_words(LANGUAGE) summarizer.null_words = get_stop_words(LANGUAGE) contentText = str("") sent_list = list() s_count = 0 for sentence in summarizer(parser.document, SENTENCES_COUNT): if s_count <= SENTENCES_COUNT: s_sent = str(sentence) #contentText=contentText+s_sent+" " sent_list.append(s_sent) s_count+=1 return str(" ".join(sent_list)) def detectTextLanguage0(self, text): # https://github.com/whiletruelearn/langua #langT =str("en") text =str(text) pLangDetect = Predict() lang = str() try: lang = pLangDetect.get_lang(text) except Exception as e: pass#print("Language Detection failed: ",e) if lang in ['de','en','es','it','fr','ru','zh-cn','ja','pt','hi','ar','tr']: if lang == 'ja': return 'jp' if lang in 'zh-cn': return 'cn' if lang == 'hi': return 'in' if lang == 'ar': return 'sa' if lang == 'zh': return 'cn' return lang.lower() return str("") def detectTextLanguage1(self, text): #langT =str("en") text = str(text) lang = str() try: lang = detect(text) except Exception as e: pass#print("Language Detection failed: ",e) if lang in 
['de','en','es','it','fr','ru','zh-cn','ja','pt','hi','ar','tr']: if lang == 'zh-cn': return 'cn' if lang == 'ja': return 'jp' if lang == 'hi': return 'in' if lang == 'ar': return 'sa' if lang == 'zh': return 'cn' return lang.lower() return str("") def detectTextLanguage2(self, text): # https://github.com/saffsd/langid.py lang = str() #langT = str("en") text = str(text) langid.set_languages(['de','en','es','it','fr','ru','zh','ja','pt','hi','ar','tr']) #try: langTemp= langid.classify(text) lang = langTemp[0].lower() #if lang in ['de','en','es','it','fr']: if lang in ['de','en','es','it','fr','ru','zh','ja','pt','hi','ar','tr']: if lang == 'ja': return 'jp' if lang == 'hi': return 'in' if lang == 'ar': return 'sa' if lang == 'zh': return 'cn' return lang.lower() #except Exception as e: # pass#print("Language Detection failed: ",e) return str("") def detectTextLanguage3(self, text): # https://github.com/aboSamoor/pycld2/blob/master/cld2/internal/generated_language.h #langT = str("en") text = str(text) lang = str() try: isReliable, textBytesFound, details = cld2.detect( text, bestEffort=True ) lang = str(details[0][1]).lower() except Exception as e: pass#print("Language Detection failed: ",e) if lang in ['de','en','es','it','fr','ru','zh','ja','pt','hi','ar','tr']: if lang == 'ja': return 'jp' if lang == 'hi': return 'in' if lang == 'ar': return 'sa' if lang == 'zh': return 'cn' return lang.lower() return str("") def detectTextLanguage(self, text): # Language Detection via Fasttext: https://fasttext.cc/docs/en/language-identification.html text = str(text) languz0 = str() languz1 = str() languz2 = str() languz3 = str() resList = list() try: languz0 = self.detectTextLanguage0(text) if len(languz0) == 2: resList.append(str(languz0.lower())) except Exception as as1: pass try: languz1 = self.detectTextLanguage1(text) if len(languz1) == 2: resList.append(str(languz1.lower())) except Exception as as1: pass try: languz2 = self.detectTextLanguage2(text) if len(languz2) == 2: resList.append(str(languz2.lower())) except Exception as as1: pass try: languz3 = self.detectTextLanguage3(text) if len(languz3) == 2: resList.append(str(languz3.lower())) except Exception as as1: pass return self.most_frequent(resList) def most_frequent(self, List): return max(set(List), key = List.count) def fixBrokenHTMLEntities(self, text): """ a.) &39; (=DIGIT ONLY!) b.) replace "&" mit "&#" c.) 
              html.unescape()   richtig ist : "'"
        """
        ###text = "hd&39;Hc&36;"
        # match31 = '\&[a-z0-9A-Z]+\;'
        text = str(text)
        match37 = '\&[0-9]+\;'  # &39;
        regex37 = re.compile(match37, re.IGNORECASE)
        match_obj = regex37.findall(text)
        for elm in match_obj:
            elm_org = elm.replace("&", "&#")
            elm_sub = unescape(elm_org)
            text = text.replace(elm, elm_sub)
            #print("elm_org",elm_org)
            #print("elm_sub",elm_sub)
        del match37
        del regex37
        del match_obj
        text = str(text)
        match37 = '\&#[0-9]+\;'  # t'
        regex37 = re.compile(match37, re.IGNORECASE)
        match_obj = regex37.findall(text)
        for elm in match_obj:
            elm_sub = unescape(elm)
            text = text.replace(elm, elm_sub)
            #print("elm_org",elm_org)
            #print("elm_sub",elm_sub)
        return text

    def isDomainBlacklisted(self, url):
        #return self.isBlacklisted(url, self.stoplist_domains_blocker)
        domain = str("")
        try:
            domain = urlparse(url).netloc
        except Exception as as1:
            pass
        if domain in self.stopdict_domains_blocker:
            return True
        return False

    def isPaywall(self, text):
        # return True:  blacklist entry found
        # return False: no blacklist entry found
        #stoplist = "/home/unaique/library/blacklists/paywall.txt"
        return self.isBlacklisted(text, self.stoplist_paywall_blocker)  # fixed: was `url`, which is undefined in this scope

    def isPornBlacklisted(self, text):
        return self.isBlacklisted(text, self.stoplist_porn_blocker)
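    # --- Illustrative sketch (not part of the original module) -----------------
    # The self.stoplist_*_blocker / self.stopdict_domains_blocker attributes used
    # by the three methods above are built in the class __init__, which is missing
    # from this excerpt. Presumably they are derived from the f_* file handles
    # opened at module level, roughly like this (the helper name is an assumption):
    @staticmethod
    def _sketch_load_blocklist(file_handle):
        """Return a set of lowercase, non-comment entries from a blacklist file."""
        entries = set()
        for line in file_handle:
            line = line.strip().lower()
            if line and not line.startswith("#"):
                entries.add(line)
        return entries
    # e.g. in __init__:  self.stoplist_porn_blocker = self._sketch_load_blocklist(f_porn)
    # ---------------------------------------------------------------------------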
    def split_sentences(self, text):
        # NOTE: the "<prd>"/"<stop>" sentinel strings below were stripped in the
        # damaged source and have been restored (standard sentence-splitter pattern).
        text = " " + text + " "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub("\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences

    def getKnowledgeGraphMoreInfo(self, search, language):
        link = "https://kgsearch.googleapis.com/v1/entities:search?key=#######&limit=1&languages=" + str(language) + "&indent=False&query=" + str(search)
        response1 = self.getWebpage(link)
        sentences = str("")
        if self.is_json(response1):
            try:
                response = json.loads(response1)
                for element in response['itemListElement']:
                    sentences = element['result']['detailedDescription']['articleBody']
            except Exception as a1:
                pass
        sents = self.split_sentences(sentences)
        sentences_c1 = sentences.count('.')
        sentences_c2 = sentences.count('!')
        sentences_c3 = sentences.count('?')
        if len(sents) > 0:
            return str(sents[0])  # sentences.split('.')[0]
        elif sentences_c1 == 1 or sentences_c2 == 1 or sentences_c3 == 1:
            return str(sentences)
        elif len(sentences) > 10:
            return str(sentences)
        return str("")

    def is_json(self, myjson):
        try:
            json.loads(myjson)
        except ValueError as e:
            return False
        return True

    def BingAPI(self, MainKeyword, SubKeywords, Language, enableFastmode):
        enableFastmode = self.useFastmode(MainKeyword + " " + SubKeywords)
        res_set = set()
        addSearchMK = str()
        addSearchSK = str()
        maxCounter = int(10)  # Quant returns at most 10 results
        if enableFastmode:
            maxCounter = int(5)
        mk_words = self.count_words_regex(MainKeyword)
        sk_words = self.count_words_regex(SubKeywords)
        """
        if (MainKeyword.count(" ") == 0 or mk_words == 0) and not enableFastmode:
            addSearchMK = self.getKnowledgeGraphMoreInfo(MainKeyword, Language)
        if (SubKeywords.count(" ") == 0 or sk_words == 0) and not enableFastmode:
            addSearchSK = self.getKnowledgeGraphMoreInfo(SubKeywords, Language)
        """
        if mk_words >= 10:
            #addSearchSK_v2 = getKnowledgeGraphMoreInfo(SubKeywords, Language)
            #query_string = str(SubKeywords+", "+addSearchSK_v2)
            query_string = str(SubKeywords)
        elif sk_words >= 10:
            #addSearchMK_v2 = getKnowledgeGraphMoreInfo(MainKeyword, Language)
            #query_string = str(MainKeyword+", "+addSearchMK_v2)
            query_string = str(MainKeyword)
        else:
            #query_string = str(MainKeyword+", "+addSearchMK+", "+SubKeywords+", "+addSearchSK)
            query_string = str(MainKeyword + ", " + SubKeywords)
        if "de" in Language.lower() or Language.lower() == "de": lang = "de"
        elif "en" in Language.lower() or Language.lower() == "en": lang = "en"
        elif "fr" in Language.lower() or Language.lower() == "fr": lang = "fr"
        elif "es" in Language.lower() or Language.lower() == "es": lang = "es"
        elif "it" in Language.lower() or Language.lower() == "it": lang = "it"
        elif "ru" in Language.lower() or Language.lower() == "ru": lang = "ru"
        elif "zh" in Language.lower() or Language.lower() == "zh" or "cn" in Language.lower() or Language.lower() == "cn": lang = "CHT"
        elif "pt" in Language.lower() or Language.lower() == "pt": lang = "pt"
        elif "jp" in Language.lower() or Language.lower() == "jp": lang = "ja"
        elif "hi" in Language.lower() or Language.lower() == "hi" or "in" in Language.lower() or Language.lower() == "in": lang = "en"
        elif "ar" in Language.lower() or Language.lower() == "ar" or "sa" in Language.lower() or Language.lower() == "sa": lang = "en"
        elif "tr" in Language.lower() or Language.lower() == "tr": lang = "tr"
        else: lang = "en"
        #try:
        # https://github.com/searx/searx/tree/master/searx/engines
        #link = "https://api.qwant.com/v3/search/web?q=Hundenahrung&count=10&offset=0&locale=de_de"
        base_url =
'https://www.bing.com/' search_string = 'search?{query}&first=0' query = 'language:{} {}'.format(lang.upper(), query_string) search_path = search_string.format(query=urlencode({'q': query}), offset=0) website = base_url + search_path #print(website) html_page = self.getWebpage(website) #print(html_page) soup = BeautifulSoup(html_page, "lxml") mcounter = int(0) for link in soup.findAll('a'): myLink = str(link.get('href')) #myLink = str(link.text) if self.is_url(myLink) and myLink.find("bing.com") == -1 and myLink.find("javascript:") == -1 and len(myLink) > 12 and not myLink.startswith('/') and (myLink.lower().startswith("http") or myLink.lower().startswith("https") ) and mcounter < maxCounter: #print("Link:",myLink) res_set.add(myLink) mcounter += int(1) #except Exception as a: # #print("htmlify",a) # pass return res_set def QuantAPI(self, MainKeyword, SubKeywords, Language, enableFastmode): enableFastmode = self.useFastmode(MainKeyword+" "+SubKeywords) res_set = set() addSearchMK = str() addSearchSK = str() maxCounter = int(10) # Quant gibt maximal 10 results her if enableFastmode: maxCounter = int(5) mk_words = self.count_words_regex(MainKeyword) sk_words = self.count_words_regex(SubKeywords) """ if (MainKeyword.count(" ") == 0 or mk_words == 0) and not enableFastmode: addSearchMK = self.getKnowledgeGraphMoreInfo(MainKeyword, Language) if (SubKeywords.count(" ") == 0 or sk_words == 0) and not enableFastmode: addSearchSK = self.getKnowledgeGraphMoreInfo(SubKeywords, Language) """ if mk_words >= 10: #addSearchSK_v2 = getKnowledgeGraphMoreInfo(SubKeywords, Language) #query_string = str(SubKeywords+", "+addSearchSK_v2) query_string = str(SubKeywords) elif sk_words >= 10: #addSearchMK_v2 = getKnowledgeGraphMoreInfo(MainKeyword, Language) #query_string = str(MainKeyword+", "+addSearchMK_v2) query_string = str(MainKeyword) else: #query_string = str(MainKeyword+", "+addSearchMK+", "+SubKeywords+", "+addSearchSK) query_string = str(MainKeyword+", "+SubKeywords) if "de" in Language.lower() or Language.lower() == "de": lang = "de_de" elif "en" in Language.lower() or Language.lower() == "en": lang = "en_us" elif "fr" in Language.lower() or Language.lower() == "fr": lang = "fr_fr" elif "es" in Language.lower() or Language.lower() == "es": lang = "es_es" elif "it" in Language.lower() or Language.lower() == "it": lang = "it_it" elif "ru" in Language.lower() or Language.lower() == "ru": lang = "us_en" elif "zh" in Language.lower() or Language.lower() == "zh" or "cn" in Language.lower() or Language.lower() == "cn": lang = "zh_cn" elif "pt" in Language.lower() or Language.lower() == "pt": lang = "pt_pt" elif "jp" in Language.lower() or Language.lower() == "jp": lang = "us_en" elif "hi" in Language.lower() or Language.lower() == "hi" or "in" in Language.lower() or Language.lower() == "in": lang = "us_en" elif "ar" in Language.lower() or Language.lower() == "ar" or "sa" in Language.lower() or Language.lower() == "sa": lang = "us_en" elif "tr" in Language.lower() or Language.lower() == "tr": lang = "us_en" else: lang = "us_en" #try: # https://github.com/searx/searx/tree/master/searx/engines #link = "https://api.qwant.com/v3/search/web?q=Hundenahrung&count=10&offset=0&locale=de_de" website = "https://api.qwant.com/v3/search/web?q="+query_string+"&count=10&offset=0&locale="+lang #print(website) html_page = self.getWebpage(website) if self.is_json(html_page): #print("isjson") search_results = json.loads(html_page) data = search_results.get('data', {}) mcounter = int(0) if search_results.get('status') == 
'success': mainline = data.get('result', {}).get('items', {}).get('mainline', {}) if not mainline: return res_set for row in mainline: mainline_type = row.get('type', 'web') if mainline_type != 'web': continue mainline_items = row.get('items', []) for item in mainline_items: #title = item.get('title', None) myLink = item.get('url', "") #if not textify.isDomainBlacklisted(myLink) and len(myLink) > 12 and not myLink.lower().endswith(".pdf") and (myLink.lower().startswith("http") or myLink.lower().startswith("https") ): #if len(myLink) > 12 and (myLink.lower().startswith("http") or myLink.lower().startswith("https") ) and mcounter < maxCounter: #print("/api.qwant.com Adding Website:",str(myLink)) res_set.add(myLink) mcounter += int(1) #except Exception as a: # #print("htmlify",a) # pass return res_set def DuckduckgoAPI(self, MainKeyword, SubKeywords, Language, enableFastmode): #print("DuckduckgoAPI") res_set = set() addSearchMK = str() addSearchSK = str() maxCounter = int(50) if enableFastmode: maxCounter = int(5) mk_words = self.count_words_regex(MainKeyword) sk_words = self.count_words_regex(SubKeywords) if mk_words >= 10: #addSearchSK_v2 = getKnowledgeGraphMoreInfo(SubKeywords, Language) #query_string = str(SubKeywords+", "+addSearchSK_v2) query_string = str(SubKeywords) elif sk_words >= 10: #addSearchMK_v2 = getKnowledgeGraphMoreInfo(MainKeyword, Language) #query_string = str(MainKeyword+", "+addSearchMK_v2) query_string = str(MainKeyword) else: #query_string = str(MainKeyword+", "+addSearchMK+", "+SubKeywords+", "+addSearchSK) query_string = str(MainKeyword+", "+SubKeywords) # https://api.duckduckgo.com/api # https://duckduckgo.com/params # https://help.duckduckgo.com/duckduckgo-help-pages/results/syntax/ if "de" in Language.lower() or Language.lower() == "de": lang = "de-de" elif "en" in Language.lower() or Language.lower() == "en": lang = "us-en" elif "fr" in Language.lower() or Language.lower() == "fr": lang = "fr-fr" elif "es" in Language.lower() or Language.lower() == "es": lang = "es-es" elif "it" in Language.lower() or Language.lower() == "it": lang = "it-it" elif "ru" in Language.lower() or Language.lower() == "ru": lang = "ru-ru" elif "zh" in Language.lower() or Language.lower() == "zh" or "cn" in Language.lower() or Language.lower() == "cn": lang = "cn-zh" elif "pt" in Language.lower() or Language.lower() == "pt": lang = "pt-pt" elif "jp" in Language.lower() or Language.lower() == "jp": lang = "jp-jp" elif "hi" in Language.lower() or Language.lower() == "hi" or "in" in Language.lower() or Language.lower() == "in": lang = "in-en" elif "ar" in Language.lower() or Language.lower() == "ar" or "sa" in Language.lower() or Language.lower() == "sa": lang = "xa-ar" elif "tr" in Language.lower() or Language.lower() == "tr": lang = "tr-tr" else: lang = "us-en" #print("DuckduckgoAPI fetch") #try: #website = "https://html.duckduckgo.com/html/?q="+query_string+"&lang="+lang # https://help.duckduckgo.com/features/safe-search/ -> &kp=1 (strict), &kp=-1 (moderate - don't show explicit results), or &kp=-2 (off) #website = "https://html.duckduckgo.com/html/?q="+query_string+"&lang="+lang+"&kl="+lang+"&kp=1&kaf=1&t=artikelschreiber.com" #"&lang=de-de&kl=de-de" website = "https://duckduckgo.com/html?q="+query_string+"&kl="+lang+"&ia=web&kp=1&kaf=1&t=artikelschreiber.com" html_page = self.getWebpage(website) #print("DuckduckgoAPI html",website) soup = BeautifulSoup(html_page, "lxml") mcounter = int(0) for link in soup.findAll('a', attrs={'class' : 'result__url'}): #myLink = str(link.get('href')) 
            myLink = str(link.text)
            #print("DuckduckgoAPI Link:",myLink)
            if len(myLink) > 12 and not myLink.startswith('/') and (myLink.lower().startswith("http") or myLink.lower().startswith("https")) and mcounter < maxCounter:
                res_set.add(myLink)
                mcounter += int(1)
        #except Exception as a:
        #    #print("htmlify",a)
        #    pass
        return res_set

    def getWebpage(self, link):
        #ftfy.fix_text(text, *, fix_entities='auto', remove_terminal_escapes=True, fix_encoding=True, fix_latin_ligatures=True, fix_character_width=True, uncurl_quotes=True, fix_line_breaks=True, fix_surrogates=True, remove_control_chars=True, remove_bom=True, normalization='NFC', max_decode_length=1000000)
        if isinstance(link, str) and (link.lower().startswith("http") or link.lower().startswith("https")):
            # use python request library for fetching
            #print("getWebpage(self, link)",link)
            try:
                with requests.get(link, headers=HeadersSimple, timeout=60, verify=False, allow_redirects=True) as r1:  # keep alive
                    #r1 = requests.get(link, headers=HeadersSimple, timeout=6, verify=False)
                    #r1.encoding = r1.apparent_encoding
                    #r1.encoding = 'utf-8'
                    #r1.encoding = 'latin-1'
                    myStatus = r1.status_code
                    myText = str(r1.text)
                    myContent = str(r1.headers['content-type'])
                    myText = myText.replace('\n', ' ')
                    myText = myText.replace("\n", ' ')
                    mT1 = myText.strip()
                    #print("getWebpage(self, link),Status Code:",myStatus," -> ",link)
                    if myStatus == 200:
                        #print("htmlify.getWebpage() Webpage size HTTP:", str(myStatus)," -> ", len(myText)," -> ", myContent)
                        return str(mT1)
                    else:
                        return str("")
            except Exception as er:
                #print("Unexpected error: getWebpage(link)", sys.exc_info()[0])
                #exc_type, exc_obj, exc_tb = sys.exc_info()
                #fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                #print(exc_type, fname, exc_tb.tb_lineno)
                return str("")
                pass
        #print("htmlify.getWebpage(link): Empty HTML Document!")
        return str("")
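    # --- Illustrative sketch (not part of the original module) -----------------
    # getWebpage() above issues one requests.get() per URL with the mobile header
    # set. When many SERP results are fetched in a row, a shared requests.Session()
    # reuses the TCP/TLS connection. A minimal variant, assuming the same
    # HeadersSimple / timeout conventions as above:
    def _sketch_get_webpages_with_session(self, links):
        pages = {}
        with requests.Session() as s:
            s.headers.update(HeadersSimple)
            for link in links:
                try:
                    r = s.get(link, timeout=60, verify=False, allow_redirects=True)
                    pages[link] = r.text if r.status_code == 200 else ""
                except Exception:
                    pages[link] = ""
        return pages
    # ---------------------------------------------------------------------------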
    def isLowQualityContent(self, text):
        # the higher the number / counter, the worse the score
        mycounter = int(0)
        for stopword in low_quality_content:
            if self.isFound(text, stopword):
                #print("Stopword found:",stopword)
                mycounter += int(1)  # True, False
        return mycounter

    def isLowQualityContentText(self, text):
        # the higher the number / counter, the worse the score
        mycounter = False
        for stopword in low_quality_content:
            if self.isFound(text, stopword):
                #print("Stopword found:",stopword)
                mycounter = True  # poor results, i.e. low-value content
                #True, False
        return mycounter

    def remove_control_characters1(self, s):
        return re1.sub(r'\p{C}', '', s)

    def isFound(self, text, stopword):
        # return True:  stopword found in text
        # return False: stopword not found in text
        #print("Text:'"+text+"' und Stopword:'"+stopword+"'")
        t_text_org = str(text)
        text = str(text.lower().strip("\n"))
        text_len = len(text)
        stopword = str(stopword).lower()
        space_stopword = str(" ") + str(stopword) + str(" ")
        # https://www.pythontutorial.net/python-regex/python-regex-word-boundary/
        # fixed: the closing word boundary must be a raw string (r'\b'), otherwise it is a
        # backspace character; the stopword is escaped so regex metacharacters cannot break the pattern
        matches1 = re.finditer(r'\b' + re.escape(stopword) + r'\b', text)
        for match in matches1:
            if match:
                return True
        if text.count(space_stopword) >= 1:
            #if space_stopword.find(text) != -1:
            #print("isBlacklisted() text.find() Match:",text,ele)
            return True
        if stopword == t_text_org.lower():
            return True
        if stopword in text:
            return True
        if "_" in text or "-" in text and text_len >= 2 and text_len < 6:
            #print("Text:'"+text+"' und Stopword:'"+stopword+"'")
            if text.find(stopword) != -1:
                return True
        return False

    def get_dictionary_by_key_value(self, dictionary, target_key, target_value):
        """Return a dictionary that contains a target key value pair.

        Args:
            dictionary: Metadata dictionary containing lists of other dictionaries.
            target_key: Target key to search for within a dictionary inside a list.
            target_value: Target value to search for within a dictionary inside a list.

        Returns:
            target_dictionary: Target dictionary that contains target key value pair.
        """
        for key in dictionary:
            if len(dictionary[key]) > 0:
                for item in dictionary[key]:
                    if item[target_key] == target_value:
                        return item
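    # --- Illustrative sketch (not part of the original module) -----------------
    # isFound() above relies on a word-boundary match; note that both \b anchors
    # have to be written inside raw strings and the stopword has to be escaped,
    # otherwise "\b" is a backspace character. A minimal, hypothetical helper
    # showing just that intended behaviour:
    @staticmethod
    def _sketch_has_word(text, word):
        return re.search(r"\b" + re.escape(word.lower()) + r"\b", text.lower()) is not None
    # Example: _sketch_has_word("Best price guarantee", "price") -> True
    #          _sketch_has_word("pricey", "price")               -> False
    # ---------------------------------------------------------------------------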
    def get_title(self, html):
        """Scrape page title."""
        #title = str("")
        #import os, sys
        try:
            #if html.title.string:
            #    title = html.title.string
            for title in html.find_all('title'):
                return str(title.get_text())
        except Exception as as1:
            #print("Catching Exception at get_title(self, html):", e)
            #print("Unexpected error:", sys.exc_info()[0])
            #exc_type, exc_obj, exc_tb = sys.exc_info()
            #fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            #print(exc_type, fname, exc_tb.tb_lineno)
            pass
        try:
            if html.find("meta", property="og:title"):
                description = html.find("meta", property="og:title").get('content')
                return self.extractTextFromHTML(description)
        except Exception as as1:
            #print("Catching Exception at get_title(self, html):", e)
            #print("Unexpected error:", sys.exc_info()[0])
            #exc_type, exc_obj, exc_tb = sys.exc_info()
            #fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            #print(exc_type, fname, exc_tb.tb_lineno)
            pass
        try:
            if html.find("meta", property="twitter:title"):
                description = html.find("meta", property="twitter:title").get('content')
                return self.extractTextFromHTML(description)
        except Exception as as1:
            #print("Catching Exception at get_title(self, html):", e)
            #print("Unexpected error:", sys.exc_info()[0])
            #exc_type, exc_obj, exc_tb = sys.exc_info()
            #fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            #print(exc_type, fname, exc_tb.tb_lineno)
            pass
        try:
            if html.find("h1"):
                title = html.find("h1").string
                return title
        except Exception as as1:
            #print("Catching Exception at get_title(self, html):", e)
            #print("Unexpected error:", sys.exc_info()[0])
            #exc_type, exc_obj, exc_tb = sys.exc_info()
            #fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            #print(exc_type, fname, exc_tb.tb_lineno)
            pass
        try:
            if html.find_all("h1"):
                title = html.find_all("h1")[0].string
                return title
        except Exception as as1:
            #print("Catching Exception at get_title(self, html):", e)
            #print("Unexpected error:", sys.exc_info()[0])
            #exc_type, exc_obj, exc_tb = sys.exc_info()
            #fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            #print(exc_type, fname, exc_tb.tb_lineno)
            pass
        try:
            if title:
                title = title.split('|')[0].strip()
                #title = title.split('-')[0].strip()
        except Exception as as1:
            #print("Catching Exception at get_title(self, html):", e)
            #print("Unexpected error:", sys.exc_info()[0])
            #exc_type, exc_obj, exc_tb = sys.exc_info()
            #fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            #print(exc_type, fname, exc_tb.tb_lineno)
            pass
        return str("")

    def get_description(self, html):
        #description = str("")
        try:
            if html.find("meta", property="og:description"):
                description = html.find("meta", property="og:description").get('content')
                return description
        except Exception as as1:
            pass
        try:
            if html.find("meta", property="description"):
                description = html.find("meta", property="description").get('content')
                return description
        except Exception as as1:
            pass
        try:
            if html.find("meta", property="twitter:description"):
                description = html.find("meta", property="twitter:description").get('content')
                return description
        except Exception as as1:
            pass
        try:
            if html.find("p"):
                description = str(html.find("p"))  # fixed: a <p> tag has no 'content' attribute; pass its HTML on for text extraction
                return self.extractTextFromHTML(description)
        except Exception as as1:
            pass
        return str("")

    def get_image(self, html, web_url):
        """Scrape share image."""
        #image = str("")
        try:
            if html.find("meta", property="image"):
                image = html.find("meta", property="image").get('content')
                li = urljoin(web_url, image)
                return li
        except Exception as as1:
            pass
        try:
            if html.find("meta", property="og:image"):
                image = html.find("meta", property="og:image").get('content')
                li = urljoin(web_url, image)
                return li
        except Exception as as1:
            pass
        try:
            if html.find("meta", property="twitter:image"):
                image = html.find("meta", property="twitter:image").get('content')
                li = urljoin(web_url, image)
                return li
        except Exception as as1:
            pass
        try:
            if html.find_all("img", src=True):
                image = html.find_all("img")
                if image:
                    image = html.find_all("img")[0].get('src')
                    li = urljoin(web_url, image)
                    return li
        except Exception as as1:
            pass
        return str("")

    def get_video(self, html):
        #og_video = str("")
        try:
            og_video = html.find("meta", property="og:video").get('content')
            return og_video
            #og_language2 = html.find("meta", attrs={'property':'og:locale'}).get('content')
        except Exception as as1:
            pass
        return str("")

    def get_meta_keywords(self, soup, text, url):
        listKeywordBlogs = ["BlogPosting", "NewsArticle", "Article", "CreativeWork", "Organization", "Product"]
        # https://practicaldatascience.co.uk/data-science/how-to-scrape-schemaorg-metadata-using-python
        try:
            base_url = get_base_url(text, url)
            metadata = extruct.extract(text, base_url=base_url, uniform=True, syntaxes=['json-ld', 'microdata', 'opengraph'])
        except Exception as as1:
            pass
        for element in listKeywordBlogs:
            try:
                kw = self.get_dictionary_by_key_value(metadata, "@type", element)  # fixed: was a bare get_dictionary_by_key_value() call
                return str(kw['keywords'])
            except Exception as as1:
                pass
        try:
            kwords = str(soup.find("meta", attrs={'name': 'keywords'}).get('content'))
            if len(kwords) > 21:
                return kwords
        except Exception as as1:
            pass
        return str("")

    def get_og_locale(self, html):
        og_language = str("")
        LANGUAGE = str("")
        try:
            og_language = html.find("meta", property="og:locale").get('content')
            #og_language2 = html.find("meta", attrs={'property':'og:locale'}).get('content')
        except Exception as as1:
            pass
        if self.isFound(og_language, "en"): LANGUAGE = "en"
        if self.isFound(og_language, "fr"): LANGUAGE = "fr"
        if self.isFound(og_language, "es"): LANGUAGE = "es"
        if
self.isFound(og_language, "it"): LANGUAGE = "it" if self.isFound(og_language, "ar") or self.isFound(og_language, "sa"): LANGUAGE = "ar" if self.isFound(og_language, "tr"): LANGUAGE = "tr" if self.isFound(og_language, "ru"): LANGUAGE = "ru" if self.isFound(og_language, "zh") or self.isFound(og_language, "cn"): LANGUAGE = "zh" if self.isFound(og_language, "de"): LANGUAGE = "de" if self.isFound(og_language, "pt"): LANGUAGE = "pt" if self.isFound(og_language, "jp") or self.isFound(og_language, "ja"): LANGUAGE = "ja" if self.isFound(og_language, "in") or self.isFound(og_language, "hi"): LANGUAGE = "in" if len(LANGUAGE) == 2: return str(LANGUAGE) try: #og_language1 = html.find("meta", property="og:locale").get('content') og_language = html.find("meta", attrs={'property':'og:locale'}).get('content') except Exception as as1: pass if self.isFound(og_language, "en"): LANGUAGE = "en" if self.isFound(og_language, "fr"): LANGUAGE = "fr" if self.isFound(og_language, "es"): LANGUAGE = "es" if self.isFound(og_language, "it"): LANGUAGE = "it" if self.isFound(og_language, "ar") or self.isFound(og_language, "sa"): LANGUAGE = "ar" if self.isFound(og_language, "tr"): LANGUAGE = "tr" if self.isFound(og_language, "ru"): LANGUAGE = "ru" if self.isFound(og_language, "zh") or self.isFound(og_language, "cn"): LANGUAGE = "zh" if self.isFound(og_language, "de"): LANGUAGE = "de" if self.isFound(og_language, "pt"): LANGUAGE = "pt" if self.isFound(og_language, "jp") or self.isFound(og_language, "ja"): LANGUAGE = "ja" if self.isFound(og_language, "in") or self.isFound(og_language, "hi"): LANGUAGE = "in" if len(LANGUAGE) == 2: return str(LANGUAGE) try: # # og_language = html.find("html").attrs.get('lang') except Exception as as1: pass if self.isFound(og_language, "en"): LANGUAGE = "en" if self.isFound(og_language, "fr"): LANGUAGE = "fr" if self.isFound(og_language, "es"): LANGUAGE = "es" if self.isFound(og_language, "it"): LANGUAGE = "it" if self.isFound(og_language, "ar") or self.isFound(og_language, "sa"): LANGUAGE = "ar" if self.isFound(og_language, "tr"): LANGUAGE = "tr" if self.isFound(og_language, "ru"): LANGUAGE = "ru" if self.isFound(og_language, "zh") or self.isFound(og_language, "cn"): LANGUAGE = "zh" if self.isFound(og_language, "de"): LANGUAGE = "de" if self.isFound(og_language, "pt"): LANGUAGE = "pt" if self.isFound(og_language, "jp") or self.isFound(og_language, "ja"): LANGUAGE = "ja" if self.isFound(og_language, "in") or self.isFound(og_language, "hi"): LANGUAGE = "in" if len(LANGUAGE) == 2: return str(LANGUAGE) return str("") def get_links(self, html, web_url): links = set() for link in html.findAll('a'): link_href = link.get('href') if not self.isFound(link_href,"javascript"): li = urljoin(web_url, link_href) # https://stackoverflow.com/questions/44001007/scrape-the-absolute-url-instead-of-a-relative-path-in-python links.add(li) return list(links) def count_words_regex(self, text): # Counting words with regular expressions -> https://datagy.io/python-count-words/#:~:text=%23-,Counting%20words,-with%20regular%20expressions return len(re.findall(r'\w+', text)) # https://practicaldatascience.co.uk/data-science/how-to-identify-internal-and-external-links-using-python#:~:text=Determine%20whether%20the%20link%20is%20internal%20or%20external def is_internal(self, url, domain): if not bool(urlparse(url).netloc): return True elif url.startswith(domain): return True else: return False #is_external def isBlacklisted(self, text, stoplist): # return True: Stopwort in Blackliste gefunden # return False: Stopwort nicht 
in Blackliste gefunden text = str(text.lower().strip("\n")) for x in range(len(stoplist)): ele2 = str(stoplist[x].lower().strip("\n")) if not ele2.startswith("#") and len(ele2) >= 1: # print("Ele:",ele2) if self.isFound(text, ele2): # print("Stopword isFound:'",ele2,"'") return True if ele2.find(text) != -1 or text.find(ele2) != -1: # print("Stopword is -find-:'",ele2,"'") return True return False def currency_symbols(self, text: str, repl: str = "_CUR_") -> str: """Replace all currency symbols in ``text`` with ``repl``.""" return RE_CURRENCY_SYMBOL.sub(repl, text) def emails(self, text: str, repl: str = "_EMAIL_") -> str: """Replace all email addresses in ``text`` with ``repl``.""" return RE_EMAIL.sub(repl, text) def emojis(self, text: str, repl: str = "_EMOJI_") -> str: """ Replace all emoji and pictographs in ``text`` with ``repl``. Note: If your Python has a narrow unicode build ("USC-2"), only dingbats and miscellaneous symbols are replaced because Python isn't able to represent the unicode data for things like emoticons. Sorry! """ return RE_EMOJI.sub(repl, text) def hashtags(self, text: str, repl: str = "_TAG_") -> str: """Replace all hashtags in ``text`` with ``repl``.""" return RE_HASHTAG.sub(repl, text) def numbers(self, text: str, repl: str = "_NUMBER_") -> str: """Replace all numbers in ``text`` with ``repl``.""" return RE_NUMBER.sub(repl, text) def phone_numbers(self, text: str, repl: str = "_PHONE_") -> str: """Replace all phone numbers in ``text`` with ``repl``.""" return RE_PHONE_NUMBER.sub(repl, text) def urls(self, text: str, repl: str = "_URL_") -> str: """Replace all URLs in ``text`` with ``repl``.""" return RE_SHORT_URL.sub(repl, RE_URL.sub(repl, text)) def user_handles(self, text: str, repl: str = "_USER_") -> str: """Replace all (Twitter-style) user handles in ``text`` with ``repl``.""" return RE_USER_HANDLE.sub(repl, text) def preprocess_text(self, x): x = str(x) #x = currency_symbols(x, " ") x = self.emails(x, " ") x = self.phone_numbers(x, " ") x = self.emojis(x, " ") x = self.hashtags(x, " ") x = self.user_handles(x, " ") x = re.sub(r"\s{2,}", " ", x) # over spaces """ if isCode(x): x = currency_symbols(x, "") #x = urls(x, "") #x = x.encode("ascii", "ignore").decode() # unicode x = re.sub("[^.,!?A-Za-z0-9]+", " ", x) # special charachters except .,!? 
    def preprocess_text(self, x):
        x = str(x)
        #x = currency_symbols(x, " ")
        x = self.emails(x, " ")
        x = self.phone_numbers(x, " ")
        x = self.emojis(x, " ")
        x = self.hashtags(x, " ")
        x = self.user_handles(x, " ")
        x = re.sub(r"\s{2,}", " ", x)  # collapse repeated spaces
        """
        if isCode(x):
            x = currency_symbols(x, "")
            #x = urls(x, "")
            #x = x.encode("ascii", "ignore").decode() # unicode
            x = re.sub("[^.,!?A-Za-z0-9]+", " ", x) # special characters except .,!?
            x = ""
        """
        return str(x)

    def isCode(self, text):
        myCode = ["=", "{", "}", ":", "|", "$", "_", "(", ")", ">", "<"]
        myCount = 0
        for c in myCode:
            myCount += text.count(c)
        if myCount > 3:
            return True   # BAD: we have JavaScript code
        return False      # GOOD: we have NO JavaScript code

    def count_spaces(self, s):
        return len(s) - len(s.strip())

    def extractTextFromHTML(self, html):
        try:
            article.set_html(html)
            article.parse()
            return str(article.text)
        except Exception as as1:
            pass
        try:
            #soup = BeautifulSoup(html, "html5lib") #"lxml") # pip install -U html5lib # https://zetcode.com/python/beautifulsoup/
            #soup = BeautifulSoup(html, "html.parser")
            soup = BeautifulSoup(html, "lxml")
            # CSS: remove attributes -> https://itecnote.com/tecnote/python-remove-all-inline-styles-using-beautifulsoup/
            for tag in soup():
                for attribute in ["class", "id", "name", "style"]:
                    del tag[attribute]
            # remove tags -> https://www.geeksforgeeks.org/remove-all-style-scripts-and-html-tags-using-beautifulsoup/
            for data in soup(['style', 'script', 'comment']):
                data.decompose()
            # https://www.geeksforgeeks.org/find-the-text-of-the-given-tag-using-beautifulsoup/
            myText = str(soup.find('body').text)
            return re.sub(r"\s{2,}", " ", myText)
        except Exception as as1:
            pass
        try:
            f = re.findall(r'\>(.*?)\<', html, re.MULTILINE | re.DOTALL)
            rList = list()
            lastStr = str("")
            for ele in f:
                ele = ele.strip()
                t = self.remove_control_characters1(ele)
                rList.append(t)
            return str(self.remove_text_inside_brackets(" ".join(rList)))
        except Exception as as1:
            pass
        return str("")
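
    # Illustrative examples (not part of the original source), assuming the helpers above:
    #   >>> self.count_spaces("  hi ")
    #   3       # total length 5 minus stripped length 2
    #   >>> self.isCode("var x = {a: 1};")
    #   True    # '=', '{', '}' and ':' together exceed the threshold of 3
    # extractTextFromHTML() tries three strategies in order and falls back on failure:
    # newspaper3k's Article parser, then a BeautifulSoup walk over <body>, then a crude
    # regex over everything between ">" and "<". Note that an empty but error-free result
    # from the first stage is still returned as-is.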
    def headline_extractor(self, headline, soup):
        # Note: for the article source we are almost always on a URI rather than a URL, so we
        # assume that every H3 sub-heading is subordinate to exactly one H1 heading
        tmp_cont_list = list()
        tmp_str = str("")
        resultLists = list()
        myTelerHeadlines = {}
        sit = soup.findAll(headline)
        headline_text = str("")
        for h1 in sit:
            h = h1.text.strip()                # the heading itself
            tags = h1.find_next_siblings()     # all text below this heading
            headl_txt = self.preprocess_text(h)
            headl_txt_len = len(headl_txt)
            if headl_txt and headl_txt_len >= self.headline_min_length:
                headline_text = headl_txt
            for tag in tags:
                tag_content = str(tag.get_text())  # -> not working: extractTextFromHTML(str(tag).string)
                tag_content_len = len(tag_content)
                if tag_content and tag_content_len >= self.headline_min_length:
                    count_space_int = self.count_spaces(tag_content)
                    if count_space_int <= self.max_spaces_count and self.count_words_regex(tag_content) >= self.min_word_count and tag_content_len >= self.min_text_length:
                        #print(h+" -> \t\t Spaces:",count_space_int," -> "+tag_content+" -> Len:",tag_content_len)
                        #headline_text = preprocess_text(h)
                        tmp_str12 = self.preprocess_text(tag_content)
                        tmp_cont_list.append(tmp_str12)
                        tmp_str12 = str("")
        tmp_str = str(" ".join(tmp_cont_list))
        if headline.lower() == "h1":
            myTelerHeadlines[headline] = headline_text
            myTelerHeadlines[headline+"_text"] = headline_text
            myTelerHeadlines[headline+"_text_low_quality_score"] = self.isLowQualityContent(headline_text)
            myTelerHeadlines[headline+"_text_ai"] = str("")          # the H1 heading is never rewritten by the AI
            myTelerHeadlines[headline+"_text_ai_plagscore"] = int(0)  # the H1 heading is never rewritten by the AI
            myTelerHeadlines[headline+"_text_till_end"] = headline_text
            myTelerHeadlines[headline+"_text_till_end_low_quality_score"] = self.isLowQualityContent(headline_text)
            resultLists.append(myTelerHeadlines)
            del myTelerHeadlines
            del tmp_str
        else:
            myTelerHeadlines[headline] = headline_text
            myTelerHeadlines[headline+"_text"] = tmp_str
            myTelerHeadlines[headline+"_text_low_quality_score"] = self.isLowQualityContent(tmp_str)
            myTelerHeadlines[headline+"_text_ai"] = str("")
            myTelerHeadlines[headline+"_text_ai_plagscore"] = int(0)
            myTelerHeadlines[headline+"_text_till_end"] = str("")
            myTelerHeadlines[headline+"_text_till_end_low_quality_score"] = int(0)
            resultLists.append(myTelerHeadlines)
            del myTelerHeadlines
            del tmp_str
        return resultLists

    def headline_to_html_end_extractor(self, headline, matching_text, soup):
        #soup = BeautifulSoup(html, "html5lib") # html5lib would be needed here
        # code for the H2 headline down to the final closing tag of the document
        tmp_set = set()
        tmp_list = list()
        hit_found = False
        #[tag.name for tag in soup.find_all()] https://www.geeksforgeeks.org/get-all-html-tags-with-beautifulsoup/
        for tag in soup.find_all():
            tag_text = str(tag.text)
            # str(tag) or tag.prettify() contains the HTML -> https://stackoverflow.com/questions/25729589/how-to-get-html-from-a-beautiful-soup-object
            tag_content = tag.prettify()
            tag_name = str(tag.name)
            tag_text_preprocessed = self.preprocess_text(tag_text)
            #if self.isFound(tag_name, headline): #(headline in tag_name or headline == tag_name):
            #    print("'"+tag_name+"'---#headline#--->>>"+tag_text)
            #if self.isFound(tag_text, matching_text): # and self.isFound(tag_name, headline):
            #    print("'"+tag_name+"'---#isFound#--->>>"+tag_text)
            if self.isFound(tag_text, matching_text) and self.isFound(tag_name, headline):
                #print("'"+tag_name+"'---#isFound#--->>>"+tag_text)
                hit_found = True
            if hit_found and tag_text_preprocessed not in tmp_set and re.findall(r'\w+[.!?]$', tag_text_preprocessed, re.MULTILINE | re.DOTALL | re.IGNORECASE | re.UNICODE):
                tmp_list.append(tag_content)
                # the text must not repeat, and every added chunk has to end with sentence punctuation [.!?]
                tmp_set.add(tag_text_preprocessed)
                #print("Word with Sentence Ending:",tag_text)
                #print(tag.attrs)
        #print(hit_found)
        #print(matching_text)
        #sys.exit(1)
        pre_html = str(" ".join(tmp_list))
        return self.extractTextFromHTML(pre_html)
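
    # Illustrative sketch (not part of the original source): for a page like
    #   <h2>Walking daily</h2><p>A long paragraph with enough words ...</p>
    # headline_extractor("h2", soup) returns a single-element list shaped roughly like
    #   [{"h2": "Walking daily", "h2_text": "...", "h2_text_low_quality_score": 0,
    #     "h2_text_ai": "", "h2_text_ai_plagscore": 0, "h2_text_till_end": "",
    #     "h2_text_till_end_low_quality_score": 0}]
    # while the "h1" branch stores the heading itself in h1_text and pre-fills the
    # *_till_end fields with the heading text.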
    def generateBeautifulTitle(self, text):
        # https://stackoverflow.com/questions/2054746/how-to-search-and-replace-utf-8-special-characters-in-python
        special_char_map = {ord('ä'): 'ae', ord('ü'): 'ue', ord('ö'): 'oe', ord('ß'): 'ss'}
        result = re.sub(r'(\w{2,})\.(\w{2,})', 'ArtikelSchreiber.com', text)
        title = result.translate(special_char_map)
        title_split = list()
        if title.find("|") != -1:
            title_split = title.split("|")
        elif title.find(" - ") != -1:
            # we want this " Walking - The good choice?" and not those "4-Tage-Bart"
            title_split = title.split(" - ")
            #print("'-' split:",title_split)
        elif title.find(":") != -1:
            title_split = title.split(":")
            #print("':' split:",title_split)
        elif title.find("?") != -1:
            title_split = title.split("?")
            #print("'?' split:",title_split)
        elif title.find("&") != -1:
            title_split = title.split("&")
            #print("'&' split:",title_split)
        elif title.find(",") != -1:
            title_split = title.split(",")
            #print("',' split:",title_split)
        elif title.find(";") != -1:
            title_split = title.split(";")
            #print("';' split:",title_split)
        if len(title_split) >= 2:
            return str(title_split[0]).strip()
        else:
            return str(title)
        return str(text)  # unreachable fallback
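
    # Illustrative example (not part of the original source), assuming the behaviour above:
    #   >>> self.generateBeautifulTitle("Grüner Tee | Wirkung und Zubereitung")
    #   'Gruener Tee'
    # The umlauts are transliterated first, then only the part before the first "|" is kept.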
    def generateCanonicalURL(self, myTmpTitle, alternative_url, language):
        seo_lang_add = dict()
        seo_lang_add["de"] = str("artikel")
        seo_lang_add["en"] = str("article")
        seo_lang_add["it"] = str("articolo")
        seo_lang_add["es"] = str("articulo")
        seo_lang_add["fr"] = str("article")
        # https://stackoverflow.com/questions/2054746/how-to-search-and-replace-utf-8-special-characters-in-python
        special_char_map = {ord('ä'): 'ae', ord('ü'): 'ue', ord('ö'): 'oe', ord('ß'): 'ss'}
        regex = re.compile('[^a-zA-Z0-9 öäüßÖÄÜ]')
        ws_filter = re.compile(r"\s+")   # collapse multiple spaces into one
        max_canonical_words = int(5)     # maximum number of website-title words in the final canonical URL
        title_tmp = myTmpTitle
        title = ws_filter.sub(" ", title_tmp).strip()
        title_content = str("")
        title_split = list()
        if title.find(":") != -1:
            title_split = title.split(":")
            #print("':' split:",title_split)
        if title.find(" - ") != -1:
            # we want this " Walking - The good choice?" and not those "4-Tage-Bart"
            title_split = title.split(" - ")
            #print("'-' split:",title_split)
        if title.find("|") != -1:
            title_split = title.split("|")
            #print("'|' split:",title_split)
        if title.find("?") != -1:
            title_split = title.split("?")
            #print("'?' split:",title_split)
        if title.find("&") != -1:
            title_split = title.split("&")
            #print("'&' split:",title_split)
        if title.find(",") != -1:
            title_split = title.split(",")
            #print("',' split:",title_split)
        if title.find(";") != -1:
            title_split = title.split(";")
            #print("';' split:",title_split)
        if len(title_split) >= 2:
            title_content_1 = str(title_split[0]).strip()
            title_content_2 = str(title_split[1]).strip()
            #print("DEBUG title_content_1:",title_content_1)
            #print("DEBUG title_content_2:",title_content_2)
            # Note: the character filter below keeps every character as written (a character can never
            # be alphabetic, whitespace and a digit at the same time); the regex.sub() afterwards does
            # the actual cleaning. https://stackoverflow.com/questions/22520932/python-remove-all-non-alphabet-chars-from-string
            title_content_1 = ''.join(c for c in title_content_1 if not c.isalpha() or not c.isspace() or not c.isdigit()).strip()
            title_content_1 = regex.sub('', title_content_1).strip()
            title_content_2 = ''.join(c for c in title_content_2 if not c.isalpha() or not c.isspace() or not c.isdigit()).strip()
            title_content_2 = regex.sub('', title_content_2).strip()
            #print("DEBUG title_content_1:",title_content_1)
            #print("DEBUG title_content_2:",title_content_2)
            title_content_1_word_list = list(title_content_1.split(" "))
            title_content_2_word_list = list(title_content_2.split(" "))
            title_content_1_word_count = len(title_content_1_word_list)
            title_content_2_word_count = len(title_content_2_word_list)
            title_content = title_content_1
            if title_content_1_word_count < 6:
                #print("DEBUG title_content_1_word_count:",title_content_1_word_count)
                #print("DEBUG title_content_2_word_count:",title_content_2_word_count)
                max_additional_words = max_canonical_words - title_content_1_word_count
                #print("DEBUG max_additional_words:",max_additional_words)
                x_zero = int(0)
                for x in title_content_2_word_list:
                    add_word = str("")
                    if x_zero < max_additional_words:
                        # take at most 5 words from the website title for the canonical URL
                        #try:
                        add_word_tmp = str(title_content_2_word_list[x_zero])  # str(title_split[1]).strip()
                        #print("DEBUG add_word_tmp:",add_word_tmp)
                        t_split_more = ''.join(c for c in add_word_tmp if not c.isalpha() or not c.isspace() or not c.isdigit()).strip()
                        add_word = regex.sub('', t_split_more).strip()
                        #except Exception as a1:
                        #    pass
                    title_content += str(" ") + str(add_word)
                    x_zero += int(1)
                #print("DEBUG title_content 2:",title_content)
        if len(title_content) < 1:
            # fallback: just take the first 5 words of the title
            title_split = title.split(" ")
            title_content = alternative_url
            try:
                # The original enumerated the >=5 / >=4 / >=3 / >=2 / >=1 cases one by one with
                # identical per-word cleaning; this loop keeps the same behaviour.
                word_limit = min(len(title_split), 5)
                cleaned_words = list()
                for i in range(word_limit):
                    w = ''.join(c for c in title_split[i] if not c.isalpha() or not c.isspace() or not c.isdigit()).strip()
                    cleaned_words.append(regex.sub('', w).strip())
                if word_limit >= 1:
                    title_content = str(" ".join(cleaned_words))
            except Exception as as1:
                return str(alternative_url)
        title_content = title_content.strip()
        title_content = str(title_content.lower())
        #print("DEBUG title 1:",title_content)
        title_content = title_content.translate(special_char_map)
        #print("DEBUG title 2:",title_content)
        title_content = ws_filter.sub(" ", title_content).strip()
        title_content_final = str(unidecode(title_content).replace(" ", "-"))
        canonical_url = "https://www.artikelschreiber.com/texts/"+str(seo_lang_add[language])+"-"+title_content_final+".html"
        return canonical_url
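
    # Illustrative example (not part of the original source), assuming language "de" and the
    # slug rules above:
    #   >>> self.generateCanonicalURL("Grüner Tee: Wirkung und Zubereitung", "https://www.artikelschreiber.com/", "de")
    #   'https://www.artikelschreiber.com/texts/artikel-gruener-tee-wirkung-und-zubereitung.html'
    # Languages outside seo_lang_add raise a KeyError, which the caller catches.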
    def calculateRank(self, myAI_obj, url, session, ip_obj, serp_obj):
        #telerN1 = {'mainkeyword':MainKeyword, 'subkeyword':SubKeywords, 'articleraw':articleRaw, 'articletext':b_text, 'score':score, 'sessionid':SessionID, 'summary':p_summary, 'sprachprofil':p_sprachprofil, 'headline':article_headline, 'description':p_description,'articleurl':p_articleurl, 'metakeys':p_metakeys, 'addontext_json':myAddonText_json, 'language':Language, 'aitext_json':p_aitext_json}
        teler = dict()  #{}
        points = int(0)
        if not isinstance(url, str):
            teler['points'] = int(-10)
            return teler
        html = self.getWebpage(url)
        if not isinstance(html, str):
            teler['points'] = int(-10)
            return teler
        social_found = False
        imp_found = False
        title = str("")
        description = str("")
        video = str("")
        language = str("")
        h_headline = str("")
        h_text = str("")
        h_text_ai = str("")
        h_text_till_end = str("")
        teler.update(ip_obj)    # write the IP information into the current teler object
        teler.update(serp_obj)  # if SERP support is active, also enable this here:
        teler['isGoodContent'] = False
        teler['openai'] = str("")
        teler['fastmode'] = False
        #teler['ip'] = str(ip_address)
        teler['url'] = str(url)
        teler['html'] = str(html)
        teler['session'] = str(session)
        teler["topics"] = str("")
        teler["detected_language"] = str("")
        teler["language"] = str("")
        teler["summary"] = str("")
        teler["title"] = str("")
        teler['title_raw'] = str("")
        teler["article_text"] = str("")
        teler["article_text_ai"] = str("")
        teler["article_text_ai_plagscore"] = int(0)
        teler["article_text_low_quality_score"] = int(0)
        teler['suggestions'] = list()
        teler['canonical'] = str("https://www.artikelschreiber.com/")
        teler['points'] = int(0)
        teler['ai_enabled'] = False
        teler["article_text_ai_raw"] = str("")
        teler["openai_raw"] = str("")
        teler["h1_text_ai_raw"] = str("")
        teler["h2_text_ai_raw"] = str("")
        teler["h3_text_ai_raw"] = str("")
        teler['alternative_url'] = str("https://www.artikelschreiber.com/")
        # if the HTML is strictly HTML5 compliant
        if self.validateHTML(html):
            points += int(1)
        # short URL
        if len(url) < 85:
            points += int(1)
        #### HTML source > 200 KB: -5p
        if len(html) < 200000:
            points += int(1)
        try:
            #soup = BeautifulSoup(html, "html5lib") #"lxml") # pip install -U html5lib # https://zetcode.com/python/beautifulsoup/
            soup = BeautifulSoup(html, "lxml")
        except Exception as as1:
            soup = BeautifulSoup(html, "html.parser")
        # CSS: remove attributes -> https://itecnote.com/tecnote/python-remove-all-inline-styles-using-beautifulsoup/
        for tag in soup():
            for attribute in ["class", "id", "style"]:
                del tag[attribute]
        # remove tags -> https://www.geeksforgeeks.org/remove-all-style-scripts-and-html-tags-using-beautifulsoup/
        for data in soup(['style', 'script', 'comment']):
            data.decompose()
        for headline in self.headlines_list:
            resultLists = self.headline_extractor(headline, soup)
            if len(resultLists) > 0:
                for element in resultLists:
                    try:
                        h_headline = element.get(headline)
                        h_text = element.get(headline+"_text")
                        h_text_ai = element.get(headline+"_text_ai")
                        h_text_low_quality_score = element.get(headline+"_text_low_quality_score")
                        h_text_till_end_low_quality_score = element.get(headline+"_text_till_end_low_quality_score")
                        teler[headline] = h_headline
                        teler[headline+"_text"] = h_text
                        teler[headline+"_text_low_quality_score"] = h_text_low_quality_score
                        teler[headline+"_text_ai"] = h_text_ai
                        teler[headline+"_text_till_end"] = str("")
                        teler[headline+"_text_till_end_low_quality_score"] = h_text_till_end_low_quality_score
                        if (self.isFound(headline,"h1") or self.isFound(headline,"h2") or self.isFound(headline,"h3")) and len(h_headline) >= self.headline_min_length and h_text_low_quality_score == 0:
                            # H1,H2,H3: bonus point if the headline is long enough and the text quality score is OK
                            points += int(1)
                        elif (self.isFound(headline,"h1") or self.isFound(headline,"h2") or self.isFound(headline,"h3")) and len(h_headline) < self.headline_min_length:
                            # H1,H2,H3: point deduction if the headline is too short
                            points -= int(1)
                        elif (self.isFound(headline,"h4") or self.isFound(headline,"h5") or self.isFound(headline,"h6")) and len(h_headline) >= self.headline_min_length and h_text_low_quality_score == 0:
                            # H4,H5,H6: bonus point if the headline is long enough and the text quality score is OK
                            points += int(1)
                        elif (self.isFound(headline,"h4") or self.isFound(headline,"h5") or self.isFound(headline,"h6")) and len(h_headline) < self.headline_min_length:
                            # H4,H5,H6: point deduction if the headline is too short
                            points -= int(1)
                        elif (self.isFound(headline,"h1") or self.isFound(headline,"h2") or self.isFound(headline,"h3")) and self.count_spaces(h_text) <= self.max_spaces_count and self.count_words_regex(h_text) >= self.min_word_count and len(h_text) >= self.min_text_length and h_text_low_quality_score == 0:
                            # H1,H2,H3: bonus point if everything is OK
                            points += int(1)
                        elif not (self.isFound(headline,"h1") or self.isFound(headline,"h2") or self.isFound(headline,"h3")) and self.count_spaces(h_text) <= self.max_spaces_count and self.count_words_regex(h_text) >= self.min_word_count and len(h_text) >= self.min_text_length and h_text_low_quality_score == 0:
                            # H4,H5,H6: point deduction if something is wrong
                            points -= int(1)
                        elif (self.isFound(headline,"h4") or self.isFound(headline,"h5") or self.isFound(headline,"h6")) and self.count_spaces(h_text) <= self.max_spaces_count and self.count_words_regex(h_text) >= self.min_word_count and len(h_text) >= self.min_text_length and h_text_low_quality_score == 0:
                            # H4,H5,H6: bonus point if everything is OK
                            points += int(1)
                        elif not (self.isFound(headline,"h4") or self.isFound(headline,"h5") or self.isFound(headline,"h6")) and self.count_spaces(h_text) <= self.max_spaces_count and self.count_words_regex(h_text) >= self.min_word_count and len(h_text) >= self.min_text_length and h_text_low_quality_score == 0:
                            # H1,H2,H3: point deduction if something is wrong
                            points -= int(1)
                    except Exception as as1:
                        pass
                    ###
                    # the actual H-text content blocks down to the end of the whole text
                    ###
                    try:
                        # note: assign the extractor result directly; the original re-read the stale value from element
                        h_text_till_end = self.headline_to_html_end_extractor(headline, h_headline, soup)
                        teler[headline+"_text_till_end"] = h_text_till_end
                        h_text_till_end_low_quality_score = self.isLowQualityContent(h_text_till_end)
                        teler[headline+"_text_till_end_low_quality_score"] = h_text_till_end_low_quality_score
                        if self.count_spaces(h_text_till_end) <= self.max_spaces_count and self.count_words_regex(h_text_till_end) >= self.min_word_count and len(h_text_till_end) >= self.min_text_length and h_text_till_end_low_quality_score == 0:
                            points += int(1)  # h_text_till_end
                        else:
                            points -= int(1)
                    except Exception as as1:
                        pass
        try:
            #h = soup.title.text.strip() # soup.select('title')[0].text.strip() # print(soup.title)
            h = self.get_title(soup)
            title = str(self.preprocess_text(h))
            if len(title) >= self.headline_min_length:
                teler['title_raw'] = str(h)
                teler['title'] = self.generateBeautifulTitle(title)
                points += int(1)
            else:
                teler['title'] = str("")  # set empty if there is no content
        except Exception as a1:
            teler['title'] = str("")  # set empty if there is no content
            pass
        try:
            h = self.get_description(soup)
            description = str(self.preprocess_text(h))
            if len(description) >= 100 and len(description) < 154:
                teler['description'] = description
                points += int(1)
            else:
                teler['description'] = str("")  # set empty if there is no content
        except Exception as a1:
            teler['description'] = str("")  # set empty if there is no content
            pass
        try:
            h = self.get_image(soup, url)
            image = str(self.preprocess_text(h))
            if len(image) >= self.headline_min_length:
                teler['image'] = image
                points += int(1)
            else:
                teler['image'] = str("")  # set empty if there is no content
        except Exception as a1:
            teler['image'] = str("")  # set empty if there is no content
            pass
        try:
            h = self.get_video(soup)
            video = str(self.preprocess_text(h))
            if len(video) >= self.headline_min_length:
                teler['video'] = video
                points += int(1)
            else:
                teler['video'] = str("")  # set empty if there is no content
        except Exception as a1:
            teler['video'] = str("")  # set empty if there is no content
            pass
        try:
            og_language = self.get_og_locale(soup)
            #og_language = str(self.preprocess_text(h))
            og_language_len = len(og_language)
            if og_language_len >= 2 and og_language_len < self.headline_min_length:
                teler['language'] = str("")
                teler['language'] = og_language
                points += int(1)
            else:
                teler['language'] = str("")
        except Exception as a1:
            teler['language'] = str("")
            pass
        try:
            h = self.get_links(soup, url)
            if len(h) > 0:
                teler['links'] = h
            else:
                teler['links'] = list()
        except Exception as a1:
            teler['links'] = list()
            pass
        try:
            h = self.get_meta_keywords(soup, html, url)
            h1 = h.replace(", ", ";").strip()
            h1 = h1.replace(" ,", ";").strip()
            h1 = h1.replace(",", ";").strip()
            if len(h1) > 0:
                teler['keywords'] = h1
                points += int(1)
            elif len(h) > 0:
                teler['keywords'] = h
                points += int(1)
            else:
                teler['keywords'] = str("")  # set empty if there is no content
        except Exception as a1:
            pass
        ###
        # find social links
        ###
        teler['socials'] = False
        for socials in social_links:
            if self.isFound(html, socials):
                teler['socials'] = True
                points += int(1)
                break
        ###
        # find important keywords ("Impressum")
        ###
        teler['important'] = False
        for imp1 in imp_keywords:
            if self.isFound(html, imp1):
                teler['important'] = True
                points += int(1)
                break
        try:
            article_text = self.extractTextFromHTML(html)
            article_processed = str(self.preprocess_text(article_text))
            article_score = self.isLowQualityContent(article_processed)
            """
            h1_text = teler.get("h1_text")
            h1_score = teler.get("h1_low_quality_score")
            h2_text = teler.get("h2_text")
            h2_score = teler.get("h2_low_quality_score")
            h3_text = teler.get("h3_text")
            h3_score = teler.get("h3_low_quality_score")
            myText = str("")
            if article_score == 0:
                myText += article_text
            if h1_score == 0:
                myText += h1_text
            if h2_score == 0:
                myText += h2_text
            if h3_score == 0:
                myText += h3_text
            #if len(myText) < self.min_text_length:
            #    myText = article_text+" "+h1_text+" "+h2_text+" "+h3_text
            article_processed = str(self.preprocess_text(article_text))
            mylow_quality_score = self.isLowQualityContent(article_processed) # the complete article is in here - fallback
            """
            teler["article_text"] = article_processed
            # note: the original referenced "mylow_quality_score" here, which only exists inside the
            # commented-out block above; article_score holds the same value for the processed text
            teler["article_text_low_quality_score"] = article_score
            if self.count_spaces(article_processed) <= self.max_spaces_count and self.count_words_regex(article_processed) >= self.min_word_count and len(article_processed) >= self.min_text_length and article_score == 0:
                points += int(1)
            else:
                points -= int(1)
        except Exception as a1:
            pass
        try:
            myText = str("")
            article_text = teler.get("article_text")
            article_score = teler.get("article_text_low_quality_score")
            # note: teler stores "h1_text_low_quality_score"; the "h1_low_quality_score" keys below come back as None
            h1_text = teler.get("h1_text")
            h1_score = teler.get("h1_low_quality_score")
            h2_text = teler.get("h2_text")
            h2_score = teler.get("h2_low_quality_score")
            h3_text = teler.get("h3_text")
            h3_score = teler.get("h3_low_quality_score")
            if article_score == 0:
                myText += article_text
            if h1_score == 0:
                myText += h1_text
            if h2_score == 0:
                myText += h2_text
            if h3_score == 0:
                myText += h3_text
            if len(myText) < self.min_text_length:
                myText = article_text+" "+h1_text+" "+h2_text+" "+h3_text
            teler["summary"] = self.doLsaSummarizer(myText)
            summary_text = teler.get("summary")
            # note: this check also referenced the undefined "mylow_quality_score"; the article score is used instead
            if self.count_spaces(summary_text) <= self.max_spaces_count and self.count_words_regex(summary_text) >= self.min_word_count and len(summary_text) >= self.min_text_length and article_score == 0:
                points += int(1)
            else:
                points -= int(1)
        except Exception as a1:
            pass
        try:
            myText = str("")
            article_text = teler.get("article_text")
            article_score = teler.get("article_text_low_quality_score")
            h1_text = teler.get("h1_text")
            h1_score = teler.get("h1_low_quality_score")
            h2_text = teler.get("h2_text")
            h2_score = teler.get("h2_low_quality_score")
            h3_text = teler.get("h3_text")
            h3_score = teler.get("h3_low_quality_score")
            if article_score == 0:
                myText += article_text
            if h1_score == 0:
                myText += h1_text
            if h2_score == 0:
                myText += h2_text
            if h3_score == 0:
                myText += h3_text
            if len(myText) < self.min_text_length:
                myText = article_text+" "+h1_text+" "+h2_text+" "+h3_text
            teler["detected_language"] = self.detectTextLanguage(myText)
        except Exception as a1:
            teler["detected_language"] = str("")
            pass
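
        # Illustrative sketch (not part of the original source): the blocks above and the topic-modeling
        # block below all rebuild the same "myText" from the article plus any heading texts whose quality
        # score is 0. A possible helper with the same behaviour (the name _assembleQualityText is
        # hypothetical and not defined anywhere in this module):
        #
        #   def _assembleQualityText(self, teler):
        #       myText = str("")
        #       article_text = teler.get("article_text")
        #       if teler.get("article_text_low_quality_score") == 0:
        #           myText += article_text
        #       for h in ("h1", "h2", "h3"):
        #           if teler.get(h + "_low_quality_score") == 0:
        #               myText += teler.get(h + "_text")
        #       if len(myText) < self.min_text_length:
        #           myText = article_text + " " + teler.get("h1_text") + " " + teler.get("h2_text") + " " + teler.get("h3_text")
        #       return myText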
        try:
            myText = str("")
            article_text = teler.get("article_text")
            article_score = teler.get("article_text_low_quality_score")
            h1_text = teler.get("h1_text")
            h1_score = teler.get("h1_low_quality_score")
            h2_text = teler.get("h2_text")
            h2_score = teler.get("h2_low_quality_score")
            h3_text = teler.get("h3_text")
            h3_score = teler.get("h3_low_quality_score")
            if article_score == 0:
                myText += article_text
            if h1_score == 0:
                myText += h1_text
            if h2_score == 0:
                myText += h2_text
            if h3_score == 0:
                myText += h3_text
            if len(myText) < self.min_text_length:
                myText = article_text+" "+h1_text+" "+h2_text+" "+h3_text
            og_language = teler.get("language")
            detected_language = teler.get("detected_language")
            if len(og_language) == 2:
                teler["topics"] = myAI_obj.topicModeling(myText, og_language)
            else:
                teler["topics"] = myAI_obj.topicModeling(myText, detected_language)
            article_topics = teler.get("topics")
            #pprint(article_topics)
            if len(article_topics) >= self.headline_min_length*10:  # 70 characters
                points += int(1)
            else:
                points -= int(1)
        except Exception as a1:
            teler["topics"] = str("")
            pass
        try:
            suggestions = list()
            nodouble = set()
            article_keywords = teler.get("keywords")
            article_topics = teler.get("topics")
            if len(article_keywords) >= self.headline_min_length-4:  # i.e. at least 3 characters
                topics = article_keywords.split(';')
                for word_text in topics:
                    w_len = len(word_text)
                    if word_text.strip() not in nodouble and w_len > 3:
                        suggestions.append("mk="+word_text.strip())
                        nodouble.add(word_text.strip())
            if len(article_topics) >= self.headline_min_length-4:
                topics = article_topics.split(';')
                for word_text in topics:
                    w_len = len(word_text)
                    if word_text.strip() not in nodouble and w_len > 3:
                        suggestions.append("mk="+word_text.strip())
                        nodouble.add(word_text.strip())
            teler['suggestions'] = list(suggestions)  # important because sets cannot be JSON-serialized
        except Exception as a1:
            pass
        try:
            # alternative_url: "canonical fallback link"
            topo = teler.get("topics")
            sessionShort = str(session[:3]).lower()
            topic1 = list(topo.lower().split(";"))
            if len(topic1) >= 3:
                # note: the original tested ">= 2" here but read topic1[2], which can raise an IndexError
                topic = str(self.remove_accents(topic1[0]))
                rand = str(self.remove_accents(topic1[1]))
                rand2 = str(self.remove_accents(topic1[2]))
                #points += int(1)
            elif len(topic1) >= 2:
                topic = str(self.remove_accents(topic1[0]))
                rand = str(self.remove_accents(topic1[1]))
                rand2 = str(self.remove_accents(topic1[1]))
                #points += int(1)
            elif len(topic1) >= 1:
                topic = str(self.remove_accents(topic1[0]))
                rand = str(self.remove_accents(topic1[0]))
                rand2 = str(self.remove_accents(topic1[0]))
                #points -= int(1)
            else:
                topic = str(self.remove_accents("blog"))
                rand = str(self.remove_accents("entry"))
                rand2 = str(self.remove_accents("id"))
                #points -= int(1)
            og_language = teler.get("language")
            detected_language = teler.get("detected_language")
            if len(og_language) == 2:
                language = og_language
            else:
                language = detected_language
            # old: filename = str(topic+"-"+rand+"-"+rand2+"-"+sessionShort).lower()+".html"
            filename = str("article-"+topic+"-"+rand+"-"+str(language).lower()+"-"+sessionShort).lower()+".html"
            # old: absolute_canonical = "https://www.artikelschreiber.com/"+str(language)+"/blog/"+filename
            teler['alternative_url'] = "https://www.artikelschreiber.com/texts/"+filename
        except Exception as a1:
            teler['alternative_url'] = str("https://www.artikelschreiber.com/")
            pass
        try:
            # canonical link:
            alternative_url = teler.get("alternative_url")
            og_language = teler.get("language")
            detected_language = teler.get("detected_language")
            if len(og_language) == 2:
                language = og_language
            else:
                language = detected_language
            myTmpTitle = teler.get("title")
            teler['canonical'] = self.generateCanonicalURL(myTmpTitle, alternative_url, language)
            points += int(2)
        except Exception as a1:
            teler['canonical'] = alternative_url
            points += int(1)
            pass
        try:
            # content/source-code ratio between 40 and 70 percent?
            article_text = teler.get("article_text")
            if len(article_text)/210000 >= 0.4 and len(article_text)/210000 < 0.71:
                points += int(1)
        except Exception as as1:
            pass
        try:
            #### HTML contains Google or Bing ads: +5p
            if html.find("pagead2.googlesyndication.com/pagead/js/adsbygoogle.js") != -1:
                points += int(1)
        except Exception as as1:
            pass
        try:
            #### structured data present in the HTML: +5p
            if html.find("application/ld+json") != -1:
                points += int(1)
        except Exception as as1:
            pass
        try:
            # if the text contains too many characters that are neither letters nor digits
            article_text = teler.get("article_text")
            bchar = self.countBadChars(article_text)  # note: the original passed an undefined "text" here
            ltext = len(article_text)
            bcharScore = float("{0:.3f}".format((bchar/ltext)*100))
            if bcharScore >= 3.2:
                points -= int(1)
        except Exception as as1:
            pass
        try:
            # if the page is behind a paywall
            if self.isPaywall(html):
                points -= int(10)
        except Exception as as1:
            pass
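
        # Illustrative arithmetic (not part of the original source): the ratio check above assumes a
        # nominal page size of 210000 characters, so an extracted article of 100,000 characters scores
        # 100000/210000 ≈ 0.476 and earns the point, while 50,000 characters (≈ 0.238) does not.
        # Likewise, 400 "bad" characters in a 10,000-character article give a bcharScore of 4.0, which
        # is above the 3.2 threshold and costs a point.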
        try:
            #### <ul>/<ol> lists in the HTML: +5; first find the <ul>/<ol> content, then read out the
            #### <li> elements from it separately
            ul = re.findall("<ul>(.*?)</ul>", html, re.IGNORECASE)
            ol = re.findall("<ol>(.*?)</ol>", html, re.IGNORECASE)
            for t in list(ul + ol):
                if self.isFound("<li>", t):
                    points += int(1)
                    break
        except Exception as as1:
            pass
        """
        # TODO### short sentences with 9 to 13 words: +5p
        """
        # Session: b459d00a59a0aca81c7415947e03746f-ARTIKELSCHREIBER_de.json contains a lot of broken,
        # misspelled characters; such garbage has to be filtered out and must get a rank of 0!
        content_text_special_chars = teler.get("h1_text")+teler.get("h2_text")+teler.get("h3_text")+teler.get("h4_text")+teler.get("h5_text")+teler.get("article_text")
        count_broken_chars = self.count_broken_character(content_text_special_chars)
        isLowQualityContent = self.isLowQualityContentText(content_text_special_chars)
        if count_broken_chars > 20 or isLowQualityContent:
            teler['points'] = int(0)
        else:
            teler['points'] = points
        return teler
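
    # Illustrative usage sketch (not part of the original source; the class name "Rankify" and the
    # empty ip_obj/serp_obj payloads are assumptions -- only the method signature above is given):
    #
    #   ranker = Rankify()
    #   teler = ranker.calculateRank(myAI_obj=myAI, url="https://www.example.com/article.html",
    #                                session="b459d00a59a0aca81c7415947e03746f",
    #                                ip_obj={}, serp_obj={})
    #   print(teler['points'], teler['canonical'])
    #
    # A page that trips the broken-character or low-quality gate always comes back with points == 0.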