# -*- coding: utf-8 -*- #!/usr/bin/env python # https://developers.google.com/custom-search/docs/xml_results#countryCodes #https://www.linkedin.com/countserv/count/share?format=jsonp&url=https://www.buzzerstar.com # pip install --upgrade spacy tensorflow gensim sumy keras markovify google-api-python-client beautifulsoup4 #from sphinxapi import * import sys, time import hashlib import pickle #import argparse import json #import sys import re import nltk import os, sys, logging, re from nltk.corpus import stopwords from nltk.tokenize import RegexpTokenizer from nltk.stem.porter import PorterStemmer from nltk.corpus import stopwords from nltk.stem.porter import PorterStemmer from nltk.corpus import stopwords from nltk.tokenize import PunktSentenceTokenizer from nltk.stem.snowball import SnowballStemmer #pip3 install stop-words from stop_words import get_stop_words stopwordsDE = get_stop_words('de') from nltk.corpus import stopwords cachedStopWords = stopwords.words("german") # load nltk's English stopwords as variable called 'stopwords' stopwords = nltk.corpus.stopwords.words('german') debug = False max_gensim_results = 23 import os import re import sys import codecs import string import time import glob import getopt import argparse from unidecode import unidecode from datetime import datetime as dTime from pprint import PrettyPrinter import math #from textblob import TextBlob as tb #from textblob_de import TextBlobDE as tb r_DateRegexList=list() r_ForbiddenRegexList=list() pSentenceDelimiter=re.compile("(\.|\?|\!)", re.IGNORECASE) a0=re.compile("(\)\.|\)\!|\)\?)", re.IGNORECASE) p01=re.compile("bzw\.", re.IGNORECASE) p0=re.compile("Rdn\.", re.IGNORECASE) p1=re.compile("Abs\.", re.IGNORECASE) p2=re.compile("Nr\.", re.IGNORECASE) p3=re.compile("Art\.", re.IGNORECASE) p4=re.compile("Aufl\.", re.IGNORECASE) p5=re.compile("vgl\.", re.IGNORECASE) p6=re.compile("Einf\.", re.IGNORECASE) p61=re.compile("ff\.", re.IGNORECASE) p62=re.compile("gem\.", re.IGNORECASE) p63=re.compile("Buchst\.", re.IGNORECASE) p64=re.compile("Dipl\.-Ing\.", re.IGNORECASE) p65=re.compile("Dipl\.", re.IGNORECASE) p66=re.compile("Ing\.", re.IGNORECASE) # neu: 30.6.2017 p67=re.compile("\"(\w{1,})\"", re.IGNORECASE) p68=re.compile("\"(\d{1,})\"", re.IGNORECASE) p691=re.compile("\'(\w{1,})\'", re.IGNORECASE) p692=re.compile("\'(\d{1,})\'", re.IGNORECASE) ''' p7=re.compile("(\d{1,})\.", re.IGNORECASE) p8=re.compile("(\w{1})\.", re.IGNORECASE) p9=re.compile("(\d{1,})\.(\d{1,})\.", re.IGNORECASE) p10=re.compile("(\w{1})\.(\w{1})\.", re.IGNORECASE) ''' p11=re.compile("(\d{1,2})\. (\w{3,}) (\d{2,})", re.IGNORECASE) # 14. April 2003 p12=re.compile("(\d{1,2})\.(\w{3,}) (\d{2,})", re.IGNORECASE) p13=re.compile("(\d{1,2})\.(\d{1,2}) (\d{2,})", re.IGNORECASE) p14=re.compile("(\d{1,2})\. (\d{1,2}) (\d{2,})", re.IGNORECASE) p15=re.compile("(\d{1,2})\. (\d{1,2})\. (\d{2,})", re.IGNORECASE) p16=re.compile("(\d{1,2})\.(\d{1,2})\.(\d{2,})", re.IGNORECASE) p17=re.compile("(\d{1,2})\. (\w{2,})", re.IGNORECASE) p18=re.compile("(\d{1,2})\.(\w{2,})", re.IGNORECASE) r_DateRegexList.append(p11) r_DateRegexList.append(p12) r_DateRegexList.append(p13) r_DateRegexList.append(p14) r_DateRegexList.append(p15) r_DateRegexList.append(p16) r_DateRegexList.append(p17) r_DateRegexList.append(p18) r_ForbiddenRegexList.append(p01) r_ForbiddenRegexList.append(p0) r_ForbiddenRegexList.append(p1) r_ForbiddenRegexList.append(p2) r_ForbiddenRegexList.append(p3) r_ForbiddenRegexList.append(p4) r_ForbiddenRegexList.append(p5) r_ForbiddenRegexList.append(p6) ''' r_ForbiddenRegexList.append(p7) r_ForbiddenRegexList.append(p8) r_ForbiddenRegexList.append(p9) r_ForbiddenRegexList.append(p10) ''' r_ForbiddenRegexList.append(p11) r_ForbiddenRegexList.append(p12) r_ForbiddenRegexList.append(p13) r_ForbiddenRegexList.append(p14) r_ForbiddenRegexList.append(p15) r_ForbiddenRegexList.append(p16) r_ForbiddenRegexList.append(p61) r_ForbiddenRegexList.append(p62) r_ForbiddenRegexList.append(p63) r_ForbiddenRegexList.append(p64) r_ForbiddenRegexList.append(p65) r_ForbiddenRegexList.append(p66) # neu: 30.6.2017 #r_ForbiddenRegexList.append(p67) #r_ForbiddenRegexList.append(p68) #r_ForbiddenRegexList.append(p691) #r_ForbiddenRegexList.append(p692) #quelle: https://openjur.de/s/abkuerzungen.html p692=re.compile("a\.A\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("a\.a\.O\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("a\.E\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("a\.F\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("abgedr\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("abheb\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("abl\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("ABl\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Abs\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("abw\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Alt\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Anh\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Anm\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("arg\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Art\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Aufl\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Bd\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("begl\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("begr\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Beil\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Beschl\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Bespr\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Best\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("bestr\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("betr\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Bfg\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("BfgGer\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Bl\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("bzw\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("c\.i\.c\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("d\.h\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("dagg\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("dch\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("ders\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("h\.M\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("i\.ü\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("i\.d\.F\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("i\.d\.R\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("i\.E\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("i\.e\.S\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("i\.gl\.S\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("i\.Grds\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("i\.R\.d\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("i\.R\.v\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("i\.S\.v\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("i\.V\.m\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("i\.w\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("i\.w\.S\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("jur\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("krit\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("lfd\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("lt\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("m\. Anm\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("m\. Bespr\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("m\.w\.N\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("m\.W\.v\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("mdl\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("mtl\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("n\.F\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("n\.n\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("n\.rk\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("n\.w\.N\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("o\.J\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("ord\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Rdnr\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Rdz\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("rel\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Rev\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Ri\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Rspr\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("s\.a\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("s\.d\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("skept\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("sof\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("sog\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("st\. Rspr\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("str\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("u\.a\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("u\.ä\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("umstr\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("unbek\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("unbest\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("unpfb\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("unzul\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("unzustd\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Urt\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("v\.A\.w\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Var\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Verf\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Verh\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Verk\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Verz\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Vfg\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("vgl\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Vorbem\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("vorgen\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("zusf\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("zust\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("zutr\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("zw\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("\.{2,}", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("\!{2,}", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("\?{2,}", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("ca\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Prof\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("Dr\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) p692=re.compile("med\.", re.IGNORECASE) r_ForbiddenRegexList.append(p692) muster_content_abkuerzung = "mustererkennung_abkuerzung.txt" muster_content_gericht = "mustererkennung_gericht.txt" muster_content_gesetz = "mustererkennung_gesetz.txt" lexikon_content = "lexikon_uniq.txt" mustererkennung_gericht = open(muster_content_gericht, 'r', encoding='latin-1').read() mustererkennung_gesetz = open(muster_content_gesetz, 'r', encoding='latin-1').read() mustererkennung_abkuerzung = open(muster_content_abkuerzung, 'r', encoding='latin-1').read() lexikon_lesen = open(lexikon_content, 'r', encoding='latin-1').read() t_musterListGericht = mustererkennung_gericht.split("\n") t_musterListGesetz = mustererkennung_gesetz.split("\n") t_lexikonList = lexikon_lesen.split("\n") t_musterListGericht = map(str.strip, t_musterListGericht) t_musterListGesetz = map(str.strip, t_musterListGesetz) t_lexikonList = map(str.strip, t_lexikonList) def isMusterLexikonFound(text): if not isinstance(text, str): return "" for ele in t_lexikonList: ele = ele.strip() if ele.startswith("#") or len(ele)<1: continue if ele in text: return ele if re.search(ele,text): return ele k = re.compile(r'\b%s\b' % ele) if k.search(text): return ele return "" def isMusterGesetzFound(text): if not isinstance(text, str): return "" for ele in t_musterListGesetz: ele = ele.strip() if ele.startswith("#") or len(ele)<1: continue # idee: # if " "+ele+" " in text: #if ele in text: # kostet kaum rechenkraft k = re.compile(r'\b%s\b' % ele) # kostet enorm rechenkraft if k.search(text): return ele return "" def isMusterGerichtFound(text): if not isinstance(text, str): return "" for ele in t_musterListGericht: ele = ele.strip() if ele.startswith("#") or len(ele)<1: continue #if ele in text: # idee: # if " "+ele+" " in text: #if re.search(ele, text): k = re.compile(r'\b%s\b' % ele) # kostet enorm rechenkraft if k.search(text): return ele return "" def isLexikonSimple(text): lexikon=set(); ele = text.strip() sentElements=ele.split() for s in sentElements: r_String=isMusterLexikonFound(s) if len(r_String)>0: #print("Result:", r_String, " s->:", s) lexikon.add(r_String) return lexikon def isGerichtSimple(text): gericht=set(); ele = text.strip() sentElements=ele.split() for s in sentElements: r_String=isMusterGerichtFound(s) if len(r_String)>0: #print("Result:", r_String, " s->:", s) gericht.add(r_String) return gericht def isGesetzSimple(text): gesetz=set(); ele = text.strip() sentElements= ele.split() for s in sentElements: r_String=isMusterGesetzFound(s) if len(r_String)>0: #print("Result:", r_String, " s->:", s) gesetz.add(r_String) return gesetz def only_numerics(seq): seq_type= type(seq) return seq_type().join(filter(seq_type.isdigit, seq)) def only_letter(seq): seq_type= type(seq) return seq_type().join(filter(seq_type.isalpha, seq)) def isParagraphSimple(text): """ § 3 EStG § 2 Abs 1 EStG §§ 2 und 3 EStG § 22 Nr 5 EStG § 38a Abs 1 Satz 3 Einkommenssteuergesetz § 2 Abs 7 Satz 2 SozR 4-7837 § 1 Nr 4 § 2 Abs 1 Satz 1 Nr 1 bis 4 EStG Allgemein: Paragraph Nummer Wort(kurz oder lang) """ paragraph=set(); t_musterListGesetz=mustererkennung_gesetz.split("\n") sentElements=text.strip() #r = re.compile("\§ \d{1,6} \w{2,}") matches = re.findall("\§ \d{1,6} \w{2,}", sentElements) #newlist = filter(r.match, t_musterListGesetz) #matches = re.findall("\§ \d{1,6}\s\w+", sentElements) #m = re.search("\§ \d \w", sentElements) if matches: #print("Cur Sent:", sentElements) for match in matches: match=match.strip() #print('Found = {}'.format(match)) if "Abs" not in match: t_ssplit=match.split() r_last=t_ssplit[-1] r_last=only_letter(r_last) for tmV1 in t_musterListGesetz: if r_last in tmV1 and len(r_last)>1 and len(tmV1)>1 and len(r_last) == len(tmV1): #print("Adding:",match ) paragraph.add(match) if "Abs" in match or "Nr" in match: r_Index=sentElements.index(match) r_Content=sentElements[r_Index:len(sentElements)] r_split=r_Content.split() #print("Position:", r_Index) #print("Position Content:", r_Content) for tm in t_musterListGesetz: tm = tm.strip() #print("Mustererkennung:", tm) for rw in r_split: rw = rw.strip() #rw = only_numerics(rw) rw = only_letter(rw) #print("texterkennung:",tm,"->",rw) if rw in tm and len(rw)>1 and len(tm)>1 and len(rw) == len(tm): r_IndexV2=0 try: r_IndexV2=sentElements.index(rw) except Exception as e: 1 if r_IndexV2 > r_Index: r_ContentV2=sentElements[r_Index:r_IndexV2] r_Fullmatch=r_ContentV2+" "+tm #print("AdvMatch: ", rw, " at Position:", r_IndexV2) #print("Position Content:", r_Content) #print("FullMatch:", r_Fullmatch) #paragraph.add("DEBUG: "+r_Fullmatch) match=match.replace("Abs", "") match=match.replace("Nr", "") paragraph.add(match+tm) return paragraph def encodeToLatin1(text): #n_String=replaceUmlauts(text) encResults = text.encode('utf-8', "ignore") #encResults = text.encode('utf-8', "ignore") return str(encResults.decode('latin-1', "ignore")) def encodeToAscii(text): #n_String=replaceUmlauts(text) encResults = text.encode('utf-8', "ignore") #encResults = text.encode('utf-8', "ignore") return str(encResults.decode('ascii', "replace")) def encodeToUTF8Adv(text): encResults = text.encode('utf-8', "ignore") #return str(encResults.decode('latin-1', "ignore")) return str(encResults.decode('utf-8', "remove")) def isDate(word): isDateElement=False for r in r_DateRegexList: y = r.search(word) if y is not None: isDateElement=True return isDateElement def isSentenceEnding(word): isSentenceEndingElement=False y = pSentenceDelimiter.search(word) y2 = a0.search(word) p1 = re.compile("(\w{2,})", re.IGNORECASE) x1 = p1.search(word) if x1 is not None and len(word)>=2 and y is not None and y2 is not None: """ print("#######################################") print("x1",x1) print("len(word)",len(word)) print("y",y) print("y2",y2) print("Sentence Ending Plus:", word) """ return True else: """print("#######################################") print("x1",x1) print("len(word)",len(word)) print("y",y) print("y2",y2) print("Sentence Ending Minus:", word) """ 1 if y is not None: isSentenceEndingElement=True if y2 is not None: isSentenceEndingElement=True for r in r_ForbiddenRegexList: x = r.search(word) if x is not None and y2 is None: # if we have something like "laufen (S.E.0)" #print("isSentenceEndingElement:",x) #plainWriteAppend("/tmp/splitter.txt","Found Negativ Match:"+x+" "+word) return False if isSentenceEndingElement is True and len(word)>2: #plainWriteAppend("/tmp/splitter.txt","Positive SentenceEnding Match:"+word) return True elif isSentenceEndingElement is True and word in " ": #plainWriteAppend("/tmp/splitter.txt","Positive SentenceEnding Match:"+word) return True else: #print("word ending false:",word) #plainWriteAppend("/tmp/splitter.txt","Negativ SentenceEnding Match:"+word) return False def mySentenceSplitter(text): words = text.split() oneSent="" allSent=list() #for w in words: for x in range(0, len(words)-1): c=words[x-1] a=words[x] b=words[x+1] w=a v=c+" "+a+" "+b oneSent_c=oneSent+" "+c+" "+w oneSent=oneSent+" "+w #oneSent=oneSent+" ;;;"+v #and isDate(v) is False if isSentenceEnding(w) is True and isDate(v) is False: #oneSent=oneSent+"{}".format(isDate(v))+"

" #oneSent=oneSent+" "+"

" y1 = pSentenceDelimiter.search(w) y2 = a0.search(w) mystr=w lastchar=mystr[-1:] lastchar=lastchar.strip() last4char=mystr[-1:] last4char=lastchar.strip() sentence_delimiter=['.','!','?'] #if y1 is None or y2 is None or ('.' not in lastchar or '!' not in lastchar or '?' not in lastchar): # no sentence delimiter #if y1 is None or y2 is None or lastchar.find("\.") == -1 or lastchar.find("\!") == -1 or lastchar.find("\?") == -1: b_count=0 # bad count: drei mal darf der sentence_delimiter nicht gefunden werden for s in sentence_delimiter: if lastchar == s: """ print("isSentenceEnding(true) w:",w) print("isSentenceEnding(true) s:",s) print("isSentenceEnding(true) lastchar:",lastchar) oneSent_c=oneSent_c+"\n" allSent.append(oneSent_c) oneSent_c="" words[x-1]="" words[x]="" """ else: """ print("isSentenceEnding(true) not w:",w) print("isSentenceEnding(true) not s:",s) print("isSentenceEnding(true) lastchar:",lastchar) """ b_count=b_count+1 #print("b_count:",b_count) if b_count == len(sentence_delimiter): #oneSent_c=oneSent_c+"\n" #print("Sentence (bad ending):",oneSent_c) #print("Sentence (bad ending) x:",x) #allSent.append(oneSent_c) #oneSent_c="" #words[x-1]="" #words[x]="" 1 #elif b_count == 2 and len(w)>=3: # 1 else: oneSent=oneSent+" "+"\n" #print("Sentence (good ending):",oneSent) #print("Sentence (good ending) x:",x) allSent.append(oneSent) oneSent="" #exit(1) return allSent def topicModeling(text): #print(type(text)) #print(len(text)) #if len(text) < 150: # return float(0.0) words = [] texts = [] text2 = mySentenceSplitter(text) # list for tokenized documents in loop #words = [w for w in text2 if not w in stopwords or not w in stopwordsDE] for w in text2: a=w.split() for t in a: if t.lower() in stopwords: continue if t.lower() in stopwordsDE: continue if t in stopwords: continue if t in stopwordsDE: continue if len(t)<3: continue if not t.isalnum(): continue if t.lower() in cachedStopWords: continue if t in cachedStopWords: continue #if not sentify.isNotBlacklisted(t): # continue tv1=encodeToUTF8Adv(t) words.append(tv1) # Number of trainings epochs num_epochs = 5 # Number of topics to show num_topics_my=23 # Number of threads to run in parallel num_workers=8 # Context window size minimum_probability_my=0.75#0.55 texts=words # turn our tokenized documents into a id <-> term dictionary dictionary = corpora.Dictionary([words]) #convert the dictionary to a bag of words corpus for reference corpus = [dictionary.doc2bow(t) for t in [words]] if debug: print("gensim.topicModeling(): Start MultiCore Topic Modelling") ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics=num_topics_my, id2word=dictionary, chunksize=30000, passes=num_epochs, workers=num_workers) b = ldamodel.get_document_topics(corpus, minimum_probability=minimum_probability_my, per_word_topics=False) count = 0 c = set() for ele in b: for e in ele: c.add(e) resultSet = set() resultList = [] d = sorted(c, reverse=True, key=lambda x: x[1]) if debug: print("gensim.topicModeling(): Parsing Results of Topic Modelling") for e in d: document_id = e[0] f = ldamodel.show_topic(document_id, topn=15) #print(f) for f_e in f: word = f_e[0]#encodeToUTF8Adv(f_e[0]) if len(resultList) >= max_gensim_results: #print("gensim.topicModeling():", resultList) return resultList if word not in resultSet: resultList.append(word) resultSet.add(word) if debug: print("gensim.topicModeling():", resultList) return resultList """ def mySentenceSplitterStringReturn(text): from sumy.models.dom._sentence import Sentence from sumy.nlp.tokenizers import Tokenizer words = text.split() oneSent="" allSent=list() allReturn="" #for w in words: for x in range(0, len(words)-1): c=words[x-1] a=words[x] b=words[x+1] w=a v=c+" "+a+" "+b oneSent=oneSent+" "+w #oneSent=oneSent+" ;;;"+v #and isDate(v) is False if isSentenceEnding(w) is True and isDate(v) is False: my_o=oneSent s = Sentence(my_o, Tokenizer("german")) allReturn+=my_o+" "+"\n" #oneSent=oneSent+"{}".format(isDate(v))+"

" #oneSent=oneSent+" "+"

" oneSent=oneSent+" "+"\n" allSent.append(s) oneSent="" return allSent """ def convertStr(s): """Convert string to either int or float.""" try: ret = int(s) except ValueError: #Try float. ret = float(s) return ret