import errno
import os
import re
import sys
import json
import uuid
import datetime
import unicodedata

import six
import requests  # pip3 install --upgrade requests
import MySQLdb as mdb
from MySQLdb import escape_string
import urllib.request
from urllib.parse import urlparse
import urllib3
# url = urllib.unquote(url).decode('utf8')
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

from weasyprint import HTML  # https://weasyprint.readthedocs.io/en/stable/tutorial.html
import langid  # https://github.com/saffsd/langid.py
from langdetect import detect  # also https://github.com/saffsd/langid.py
#import deepl  # https://github.com/freundTech/deepl-cli

#from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.summarizers.reduction import ReductionSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

import spacy    # See "Installing spaCy"
import justext  # pip3 install -U justext
import pydeepl  # pip3 install -U pydeepl
import lxml.etree                    # used by parse_html()
from lxml.html.clean import Cleaner  # used by preprocessorInternal()
import ssl                           # used by getWebpagesSimple()
import string                        # used by replace_unprintable()
#import textwrap  # pip3 install -U textwrap
#from googletrans import Translator  # pip3 install -U googletrans
# pip3 install -U google-cloud-translate
# pip3 install -U google-cloud-storage
#translator = Translator()

nlp_de = spacy.load('de_core_news_sm')
nlp_en = spacy.load('en_core_web_sm')
nlp_de.max_length = 1000000
nlp_en.max_length = 1000000

# POS tags accepted as nouns / verbs (STTS and Universal tag sets).
nlp_allowed = [u"NN", u"NNP", u"NNPS", u"PROPN", u"NOUN", u"NE", u"NNE"]
verb_allowed = [u"VMFIN", u"VMINF", u"VMPP", u"VVFIN", u"VVIMP", u"VVINF",
                u"VVIZU", u"VVPP", u"VERB"]

# https://www.tutorialspoint.com/How-to-trim-down-non-printable-characters-from-a-string-in-Python
# Get all unicode characters
all_chars = (chr(i) for i in range(sys.maxunicode))
# Get all non printable characters
control_chars = ''.join(c for c in all_chars if unicodedata.category(c) == 'Cc')
# Create regex of above characters
control_char_re = re.compile('[%s]' % re.escape(control_chars))

re_pattern = re.compile(u'[^\u0000-\uD7FF\uE000-\uFFFF]', re.UNICODE)

UserAgent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
UserAgentMobile = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                   "(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36")
Headers = {'user-agent': UserAgent, 'Connection': 'keep-alive',
           'Accept-Encoding': 'gzip, deflate'}
HeadersSimple = {'user-agent': UserAgentMobile, 'Connection': 'keep-alive',
                 'Accept-Encoding': 'gzip, deflate'}
HeadersSimpleADV = {'user-agent': UserAgentMobile,
                    'Authorization': 'ce78143f444846d14d338f0da26a2434',
                    'Connection': 'keep-alive',
                    'Accept-Encoding': 'gzip, deflate'}
# curl -X GET -H "Authorization: ce78143f444846d14d338f0da26a2434" "https://free.donreach.com/shares?providers=facebook,google,twitter,linkedin,xing&url=http://9gag.com/"
## csvsql --db mysql://root:###########99@localhost:3306/SAMYSTOCKS --tables shareprices_daily --insert /home/samystocks/simfin/regular_download/2019-10-14/us-shareprices-daily.csv

# Bonus words bias the summarizer towards finance-related sentences.
bonusList = ["dollar", "euro", "$", "€", "revenue", "money", "growth", "company",
             "Earning", "Price", "Rating", "Debt", "Equity", "Return", "margin",
             "Gross margin", "Operating margin", "Net margin", "EBITDA",
             "Cash flow margin", "Return on assets", "Return on equity",
             "Return on invested capital", "cashflow", "assets", "capital",
             "turnover", "Interest", "income", "Payout", "ROE", "investor",
             "stock", "invest", "investment", "dividend", "cash", "flow"]
capital","cashflow","assets","capital","turnover","Interest","income","Payout","ROE","investor","stock","invest","investment","dividend","cash","flow"] stigmaList = ["XBRL"] #print("todo: mysql aktivieren und json ins sql schreiben") #exit(1) def _is_wordlike(tok): return tok.orth_ and tok.orth_[0].isalpha() def sentence_division_suppresor(doc): """Spacy pipeline component that prohibits sentence segmentation between two tokens that start with a letter. Useful for taming overzealous sentence segmentation in German model, possibly others as well.""" for i, tok in enumerate(doc[:-1]): if _is_wordlike(tok) and _is_wordlike(doc[i + 1]): doc[i + 1].is_sent_start = False return doc nlp_de.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser') nlp_en.add_pipe(sentence_division_suppresor, name='sent_fix', before='parser') def translateText(txttranslate1, target): textList = wrap(txttranslate1, 4500) myFinalReturnText = str("") for txttranslate in textList: #def translateText(text, target, model=translate.NMT): translation="" #translate_client = translate.Client() if isinstance(txttranslate, six.binary_type): txttranslate = txttranslate.decode('utf-8') #try: # Text can also be a sequence of strings, in which case this method # will return a sequence of results for each text. #result = translate_client.translate(text, target_language=target, model=model) subscriptionKey = 'eaac938f51ab405998eab07017b0bb8f' subscriptionKey = 'ab4e516e288146f88d6b6cb001171d12' # If you encounter any issues with the base_url or path, make sure # that you are using the latest endpoint: https://docs.microsoft.com/azure/cognitive-services/translator/reference/v3-0-translate base_url = 'https://api.cognitive.microsofttranslator.com' path = '/translate?api-version=3.0' #params = '&to=de&to=it' params = '&to='+target.lower() constructed_url = base_url + path + params headers = { 'Ocp-Apim-Subscription-Key': subscriptionKey, 'Content-type': 'application/json', 'X-ClientTraceId': str(uuid.uuid4()) } # You can pass more than one object in body. 
def translateTextDeepL(targetLang, text):
    # Requires the "deepl" package (see the commented-out import at the top of
    # this file); without it the exception handler below returns an empty string.
    translation = ""
    try:
        translation, extra_data = deepl.translate(text, target=targetLang)
    except Exception as e:
        print("translateTextDeepL(): DeepL Translation failed: ", e)
        print("Unexpected error:", sys.exc_info()[0])
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
    return translation


def google_translate(myText, toLanguage):
    # https://github.com/GoogleCloudPlatform/python-docs-samples/tree/master/translate/cloud-client
    # [START translate_quickstart]
    # Imports the Google Cloud client library
    from google.cloud import translate

    # Instantiates a client
    translate_client = translate.Client()

    # The text to translate
    text = myText
    # The target language
    target = toLanguage

    # Translates the text into the target language
    translation = translate_client.translate(text, target_language=target)

    # print(u'Text: {}'.format(text))
    # print(u'Translation: {}'.format(translation['translatedText']))
    # [END translate_quickstart]
    return translation['translatedText']


def wrap(s, w):
    """
    :param s: str; source string
    :param w: int; width to split on
    """
    return [s[i:i + w] for i in range(0, len(s), w)]


def split_sentences(text):
    """
    rList = list()
    lFlag = detectTextLanguage(text)
    if lFlag == "de":
        nlp_de.max_length = len(text) + 1
        doc = nlp_de(text)
    else:
        nlp_en.max_length = len(text) + 1
        doc = nlp_en(text)
    for sent in doc.sents:
        rList.append(str(sent))
    #return TAG_RE.sub('', text)
    #return re.split(r'(?<=[^A-Z].[.!?]) +(?=[A-Z])', text)  #, re.MULTILINE)
    ##### THIS ONE WAS THE BEST:
    return re.split(r'(?<=[^A-Z\{\}].[.!?]) +(?=[A-Z])', text)  #, re.MULTILINE)
    #return [s.strip() for s in re.split('[\.\?!]', text) if s]
    return rList
    """
    rList = list()
    nlp_en.max_length = len(text) + 1
    doc = nlp_en(text)
    for sent in doc.sents:
        rList.append(str(sent))
    return rList


def doLsaSummarizer(text):
    """
    LANGUAGE = "german"
    if "en" in Language.lower():
        #LANGUAGE_DE = "german"
        LANGUAGE = "english"
    """
    LANGUAGE = "english"
    SENTENCES_COUNT = 85
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    summarizer.null_words = get_stop_words(LANGUAGE)
    summarizer.bonus_words = bonusList
    summarizer.stigma_words = stigmaList
    contentText = str("")
    s_count = 0
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        if s_count <= SENTENCES_COUNT:
            s_sent = str(sentence)
            contentText = contentText + s_sent + " "
        s_count += 1
    return contentText


def doReductionSummarizer(text, Language):
    # NOTE: beautifyUpperLowercase() is not defined in this file; it has to be
    # provided elsewhere, otherwise this call raises a NameError.
    text = beautifyUpperLowercase(text)
    LANGUAGE = "german"
    if "en" in Language.lower():
        #LANGUAGE_DE = "german"
        LANGUAGE = "english"
    SENTENCES_COUNT = 3
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    #stemmer = Stemmer(LANGUAGE)
    summarizer = ReductionSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    #summarizer.null_words = get_stop_words(LANGUAGE)
    #summarizer.bonus_words = [MainKeyword, SubKeywords]
    #summarizer.stigma_words = ["und", "der", "die", "das", "oder", "wie", "aber"]
    contentText = ""
    s_count = 0
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        if s_count < SENTENCES_COUNT:
            s_sent = str(sentence)
            contentText = contentText + s_sent + " "
        s_count += 1
    return contentText
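# Usage sketch for the sumy-based summarizers (illustrative only; the URL is a
# placeholder and parse_html()/getWebpagesSimple() are defined further below):
#
#   article_text = parse_html(getWebpagesSimple("https://example.com/earnings-report"))
#   long_summary = doLsaSummarizer(article_text)               # up to 85 sentences
#   short_summary = doReductionSummarizer(article_text, "en")  # 3 sentences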
def unique(iterable):
    """ Returns a list copy in which each item occurs only once (in-order). """
    seen = set()
    return [x for x in iterable if x not in seen and not seen.add(x)]


def remove_control_chars(s):
    return control_char_re.sub('', s)


def detectTextLanguage(text):
    # Run both detectors; if they disagree, trust langid (detectTextLanguage2).
    text = str(text)
    languz1 = detectTextLanguage1(text)
    languz2 = detectTextLanguage2(text)
    if languz1 == languz2:
        return languz1.lower()
    else:
        return languz2.lower()
    return 'de'


def detectTextLanguage2(text):
    # https://github.com/saffsd/langid.py
    lang = "de"
    text = str(text)
    langid.set_languages(['de', 'en', 'es', 'it', 'fr'])
    try:
        langList = langid.classify(text)
        #print(langList[0])
        if langList[0] in ['de', 'en', 'es', 'it', 'fr']:
            return langList[0].lower()
    except Exception as e:
        pass  #print("Language Detection failed: ", e)
    return 'de'


def detectTextLanguage1(text):
    lang = "de"
    text = str(text)
    try:
        lang = detect(text)
    except Exception as e:
        pass  #print("Language Detection failed: ", e)
    if lang in ['de', 'en', 'es', 'it', 'fr']:
        return lang.lower()
    return 'de'


"""
Source: https://github.com/chiphuyen/lazynlp/blob/master/lazynlp/cleaner.py
"""


def replace_unprintable(txt):
    """Replace non-printable characters with printable characters"""
    # unprint_file is an absolute path, so it is opened directly.
    unprint_file = "/home/unaique/library/blacklists/unprintable_chars.txt"
    printable = set(string.printable)
    lines = open(unprint_file, 'r').readlines()
    chars = {line.strip().split(':')[0]: line.strip().split(':')[1] for line in lines}
    return ''.join([c if c in printable else chars[c] for c in txt])


def remove_html_tags(text):
    """Remove html tags from a string"""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)


def preprocessorInternal(dom):
    "Removes unwanted parts of DOM."
    options = {
        "processing_instructions": False,
        "remove_unknown_tags": True,
        "safe_attrs_only": False,
        "page_structure": False,
        "annoying_tags": False,
        "frames": False,
        "meta": False,
        "links": False,
        "javascript": False,
        "scripts": True,
        "comments": True,
        "style": True,
        "embedded": True,
        "forms": True,
        "kill_tags": ("head",),
    }
    cleaner = Cleaner(**options)
    return cleaner.clean_html(dom)
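# Quick check of the language-detection consensus (illustrative only; expected
# results, not recorded output):
#
#   detectTextLanguage("Der Umsatz ist im letzten Quartal deutlich gestiegen.")  # expected 'de'
#   detectTextLanguage("Revenue grew significantly in the last quarter.")        # expected 'en'
#
# detectTextLanguage() runs langdetect and langid on the same text and falls
# back to the langid result whenever the two detectors disagree.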
def parse_html(page):
    """
    Clean HTML tags for webpages that aren't Gutenberg books
    # https://github.com/miso-belica/jusText/tree/dev/justext/stoplists
    """
    #page = preprocessorInternal(page1)
    tmpPage = remove_html_tags(page)
    lang = detectTextLanguage(tmpPage)
    try:
        lang = lang.lower()
        #parts = justext.justext(page, justext.get_stoplist('English'))
        if lang == "de" or lang in "de":
            parts = justext.justext(page, justext.get_stoplist('German'))
        elif lang == "en" or lang == "us" or lang in "en" or lang in "us":
            parts = justext.justext(page, justext.get_stoplist('English'))
        else:
            parts = justext.justext(page, justext.get_stoplist('German'))
    except lxml.etree.ParserError as e:
        print('library.Htmlify() Page empty')
        return ''
    except UnicodeDecodeError as e:
        print("library.Htmlify() Can't decode utf-8")
        return ''
    paragraphs = []
    for part in parts:
        if not part.is_boilerplate:
            paragraphs.append(part.text)
    return '\n\n'.join(paragraphs)


def count_words(text):
    return len(text.split())


def removeDumpSentences(iList):
    # Keep only sentences longer than 30 characters with more than five words.
    rList = list()
    for e in iList:
        c = count_words(e)
        if len(e) > 30 and c > 5:
            rList.append(e)
    return "".join(rList)


def encodeToLatin1(text):
    text = text.replace('ß', 'ss')
    encResults = text.encode('utf-8', "ignore")
    #encResults = text.encode('utf-8', "ignore")
    s_string = str(encResults.decode('latin-1', "ignore"))
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToUTF8Adv(text):
    encResults = text.encode('utf-8', "ignore")
    #return str(encResults.decode('latin-1', "ignore"))
    s_string = str(encResults.decode('utf-8', "ignore"))
    #textv1 = re_pattern.sub(u'\uFFFD', s_string)
    return s_string


def encodeToUTF8(text):
    return text.encode('utf-8', "ignore")


def insertEntryToMysql(p_company_name, p_company_description, p_investor_relations,
                       p_company_link, p_symbol, p_isin, p_dividend_payer,
                       p_dividend_history, p_trafficlight, p_smileytype,
                       p_qualityscore_details, p_qualityscore, p_kurs, p_summary):
    a = datetime.datetime.now()
    """
    myWordCloudJson = str(myWordCloudJson)
    p_simpletext = remove_control_chars(p_simpletext)
    MainKeyword = re_pattern.sub(u'\uFFFD', MainKeyword)
    p_simpletext = p_simpletext.encode('unicode_escape').decode('unicode_escape')
    p_timestamp = time.time()
    """
    db = mdb.connect(host="localhost", user="root", passwd="###########99",
                     db="SAMYSTOCKS", use_unicode=True, charset="utf8mb4")
    cursor = db.cursor()
    cursor.execute("SET NAMES utf8mb4")
    cursor.execute("SET CHARACTER SET utf8mb4")
    # Execute the SQL command
    sql = "INSERT INTO samystocks_webpage (p_company_name, p_company_description,p_investor_relations,p_company_link,p_symbol,p_isin,p_dividend_payer,p_dividend_history,p_trafficlight,p_smileytype,p_qualityscore_details,p_qualityscore,p_lastmodified,p_kurs,p_summary) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    p_lastmodified = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        cursor.execute(sql, (p_company_name, p_company_description, p_investor_relations,
                             p_company_link, p_symbol, p_isin, p_dividend_payer,
                             p_dividend_history, p_trafficlight, p_smileytype,
                             p_qualityscore_details, p_qualityscore, p_lastmodified,
                             p_kurs, p_summary))
        db.commit()
        # disconnect from server
        cursor.close()
    except Exception as e:
        print("Error:", e)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        ## return -1
    ##except:
    ##    # Rollback in case there is any error
    ##    cursor.rollback()
    b = datetime.datetime.now()
    delta = b - a
    print("DBify do dbify.insertCacheToMysql(): Processing finished after:", delta)
    return 1
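# Hedged example call for insertEntryToMysql(), using the p_* variables that are
# assembled further below (assumes a p_summary string has been produced first,
# e.g. by doLsaSummarizer(); p_lastmodified is generated inside the function):
#
#   insertEntryToMysql(p_company_name, p_company_description, p_investor_relations,
#                      p_company_link, p_symbol, p_isin, p_dividend_payer,
#                      p_dividend_history, p_trafficlight, p_smileytype,
#                      p_qualityscore_details, p_qualityscore, p_kurs, p_summary)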
def getWebpagesSimple(link):
    if link.lower().startswith(("http", "https", "ftp", "ftps")):
        #print("getWebpagesSimple():", link)
        try:
            r1 = requests.get(link, headers=HeadersSimple, timeout=5, verify=False)
            myText = r1.text
            myText = myText.replace('\n', ' ')
            myText = myText.replace("\n", ' ')
            if len(myText) >= 100:
                return myText.strip()
        except Exception as er:
            #print("Unexpected error: getWebpagesSimple(link)", sys.exc_info()[0])
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            #print(exc_type, fname, exc_tb.tb_lineno)
            try:
                # Fallback: fetch via urllib.request with certificate checks disabled.
                request = urllib.request.Request(link, headers=HeadersSimple)
                contents = urllib.request.urlopen(
                    request, timeout=5,
                    context=ssl._create_unverified_context()).read()
                contents = contents.decode('utf-8', 'ignore')
                contents = contents.replace('\n', ' ')
                contents = contents.replace("\n", ' ')
                if len(contents) >= 100:
                    return contents.strip()
            except Exception as er:
                #print("Unexpected error: getWebpagesSimple(link)", sys.exc_info()[0])
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                #print(exc_type, fname, exc_tb.tb_lineno)
    #print("htmlify.getWebpagesSimple(link): Empty HTML Document!")
    return str("")


def getCompanyName(ticker):
    db = mdb.connect("localhost", "root", "###########99", "SAMYSTOCKS")
    # prepare a cursor object using cursor() method
    cursor = db.cursor()
    try:
        sql = "SELECT Company_Name, IndustryId FROM `companies` WHERE `Ticker` = %s LIMIT 1;"
        # Execute the SQL command
        cursor.execute(sql, (ticker,))
        data = cursor.fetchall()
        for r in data:
            if r:
                s = str(r[0])
                s = s.replace('"', '')
                db.close()
                return s
    except Exception:
        pass
    db.close()
    return str("")


def getCompanyIndustry(ticker):
    db = mdb.connect("localhost", "root", "###########99", "SAMYSTOCKS")
    # prepare a cursor object using cursor() method
    cursor = db.cursor()
    try:
        sql = "SELECT Company_Name, IndustryId FROM `companies` WHERE `Ticker` = %s LIMIT 1;"
        # Execute the SQL command
        cursor.execute(sql, (ticker,))
        data = cursor.fetchall()
        for r in data:
            if r:
                s = str(r[1])
                #s = s.replace('"', '')
                db.close()
                return s
    except Exception:
        pass
    db.close()
    return str("")


def getSektor(industry_id):
    db = mdb.connect("localhost", "root", "###########99", "SAMYSTOCKS")
    # prepare a cursor object using cursor() method
    cursor = db.cursor()
    try:
        #SELECT Industry FROM `industries` WHERE IndustryId=IndustryId Limit 1
        sql = "SELECT Industry FROM `industries` WHERE IndustryId = %s LIMIT 1;"
        # Execute the SQL command
        cursor.execute(sql, (industry_id,))
        data = cursor.fetchall()
        for r in data:
            if r:
                s = str(r[0])
                s = s.replace('"', '')
                db.close()
                return s
    except Exception:
        pass
    db.close()
    return str("")


def getKurs(ticker):
    # Returns the daily closing prices for a ticker as a JSON list of
    # {date: close} objects.
    db = mdb.connect("localhost", "root", "###########99", "SAMYSTOCKS")
    # prepare a cursor object using cursor() method
    cursor = db.cursor()
    resultsList = list()
    try:
        sql = "SELECT Date, Close FROM `shareprices_daily` WHERE `Ticker` = %s LIMIT 20000;"
        # Execute the SQL command
        cursor.execute(sql, (ticker,))
        data = cursor.fetchall()
        for r in data:
            if r:
                d = str(r[0])
                k = str(r[1])
                e = {d: k}
                resultsList.append(e)
    except Exception:
        pass
    db.close()
    j = json.dumps(resultsList)
    return j
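# getKurs() returns a JSON string with one {date: close} object per trading day,
# e.g. (values are illustrative, not real data):
#
#   [{"2019-10-14": "53.92"}, {"2019-10-15": "54.10"}, ...]
#
# getDividende() below returns the same shape for dividend payments.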
def getDividende(ticker):
    # Returns all dividend payments for a ticker as a JSON list of
    # {date: dividend} objects.
    db = mdb.connect("localhost", "root", "###########99", "SAMYSTOCKS")
    # prepare a cursor object using cursor() method
    cursor = db.cursor()
    resultsList = list()
    try:
        sql = "SELECT Date, Dividend FROM `shareprices_daily` WHERE `Ticker` = %s AND Dividend > 0 LIMIT 20000;"
        # Execute the SQL command
        cursor.execute(sql, (ticker,))
        data = cursor.fetchall()
        for r in data:
            if r:
                d = str(r[0])
                k = str(r[1])
                e = {d: k}
                resultsList.append(e)
    except Exception:
        pass
    db.close()
    j = json.dumps(resultsList)
    return j


def getSource(ticker, year):
    # Returns the quarterly report sources for a ticker and fiscal year as a
    # list of {fiscal period: source url} objects.
    db = mdb.connect("localhost", "root", "###########99", "SAMYSTOCKS")
    # prepare a cursor object using cursor() method
    cursor = db.cursor()
    resultsList = list()
    try:
        sql = "SELECT `Source`, `Fiscal Period` FROM `balance_full_quarterly` WHERE `Ticker` = %s AND `Fiscal Year` = %s LIMIT 8;"
        # Execute the SQL command
        cursor.execute(sql, (ticker, year))
        data = cursor.fetchall()
        #print(data)
        for r in data:
            if r:
                s = str(r[0])
                d = str(r[1])
                s = s.replace('"', '')
                resultsList.append({d: s})
    except Exception:
        pass
    db.close()
    return resultsList


#t = translateText("I am a good guy walking through the woods.", "DE")
#print(t)
#exit(1)

# https://dividendenfluss.de/dividenden-koenige-liste-2018-26-aktien-mit-ueber-50-jahre-dividenden-wachstum/
# Description text: https://simfin.com/data/companies/89661
# ISIN: https://www.google.com/search?q=KO+isin
# Investor Relations: https://www.google.com/search?q=KO+investor+relations
# Translator: https://www.deepl.com/translator#en/de/

p_symbol = "KO"  # upper case
myYear = "2019"
p_isin = "US1912161007"
#myCompany_description = "Microsoft Corp is a technology company. It develops, licenses, and supports a wide range of software products and services. Its business is organized into three segments: Productivity and Business Processes, Intelligent Cloud, and More Personal Computing."
p_company_description = "Coca-Cola Co ist ein Unternehmen für alkoholfreie Getränke, das eine Vielzahl von kohlensäurehaltigen und kohlensäurefreien Marken herstellt, darunter Coca-Cola, Diet Coke, Fanta, Sprite, Minute Maid, Powerade und Dasani."
p_investor_relations = "https://www.coca-colacompany.com/investors"
p_company_link = "https://www.cocacola.de/de/home/"
p_dividend_payer = "1"
p_trafficlight = "2"  # 0: red, 1: yellow, 2: green
p_smileytype = "1"    # 1: yes, 0: no
p_qualityscore = "-1"
p_qualityscore_details = "-1"

# Company name: SELECT Company_Name, IndustryId FROM `companies` WHERE `Ticker` = symbol Limit 1
p_company_name = getCompanyName(p_symbol)
print("Company Name:" + p_company_name)
print()

industryID = getCompanyIndustry(p_symbol)
#print("Industry ID:" + industryID)
#print()

s = getSektor(industryID)
print("Sektor:" + s)
print()

p_kurs = getKurs(p_symbol)
#print("Kurs:" + len(p_kurs))
#print("Kurs:" + p_kurs)
print()

p_dividend_history = getDividende(p_symbol)
#print("Dividende:" + p_dividend_history)
print()

ms = getSource(p_symbol, myYear)
#ms1 = getSource(p_symbol, "2018")

myDownloadDir = "/home/samystocks/data_installer/temporar/" + p_symbol.lower()
if not os.path.exists(myDownloadDir):
    os.makedirs(myDownloadDir)

resultsList2 = list()
for myListkey in ms:
    for key in myListkey:
        #print("KEY:", key)
        #print("URL", myListkey[key])
        qu = key
        url = myListkey[key]
        h = getWebpagesSimple(url)
        l = h.split(">")
        c = 0
        for ele in l:
            c = c + 1
            if ele.find("10-Q") != -1:
                hit = l[c + 1]
                if hit.find(".htm") != -1:
                    s = hit.replace('