#!/usr/bin/env python
# -*- coding: utf-8 -*-
# https://developers.google.com/custom-search/docs/xml_results#countryCodes
# https://www.linkedin.com/countserv/count/share?format=jsonp&url=https://www.buzzerstar.com
# pip install --upgrade spacy tensorflow gensim sumy keras markovify google-api-python-client beautifulsoup4
# pip3 install stop-words
#from sphinxapi import *
import os
import sys
import re
import time
import math
import glob
import json
import getopt
import string
import codecs
import pickle
import hashlib
import logging
import argparse
import nltk
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, PunktSentenceTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from stop_words import get_stop_words
from unidecode import unidecode
from datetime import datetime as dTime
from pprint import PrettyPrinter
#from textblob import TextBlob as tb
#from textblob_de import TextBlobDE as tb

stopwordsDE = get_stop_words('de')
cachedStopWords = stopwords.words("german")
# load nltk's German stopwords as a variable called 'stopwords'
# (this shadows the imported nltk stopwords module of the same name)
stopwords = nltk.corpus.stopwords.words('german')

debug = False
max_gensim_results = 23
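
# Illustrative sketch (not part of the pipeline): the three stopword
# collections above overlap heavily and are all consulted in topicModeling().
if debug:
    print("und" in stopwordsDE)      # expected: True ("und" is a German stopword)
    print("und" in cachedStopWords)  # expected: True
    print("und" in stopwords)        # expected: True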
r_DateRegexList = list()
r_ForbiddenRegexList = list()
pSentenceDelimiter = re.compile(r"(\.|\?|\!)", re.IGNORECASE)
a0 = re.compile(r"(\)\.|\)\!|\)\?)", re.IGNORECASE)
# German abbreviations that must never be treated as sentence endings
p01 = re.compile(r"bzw\.", re.IGNORECASE)
p0 = re.compile(r"Rdn\.", re.IGNORECASE)
p1 = re.compile(r"Abs\.", re.IGNORECASE)
p2 = re.compile(r"Nr\.", re.IGNORECASE)
p3 = re.compile(r"Art\.", re.IGNORECASE)
p4 = re.compile(r"Aufl\.", re.IGNORECASE)
p5 = re.compile(r"vgl\.", re.IGNORECASE)
p6 = re.compile(r"Einf\.", re.IGNORECASE)
p61 = re.compile(r"ff\.", re.IGNORECASE)
p62 = re.compile(r"gem\.", re.IGNORECASE)
p63 = re.compile(r"Buchst\.", re.IGNORECASE)
p64 = re.compile(r"Dipl\.-Ing\.", re.IGNORECASE)
p65 = re.compile(r"Dipl\.", re.IGNORECASE)
p66 = re.compile(r"Ing\.", re.IGNORECASE)
# new: 30.6.2017 - quoted words and numbers
p67 = re.compile(r"\"(\w{1,})\"", re.IGNORECASE)
p68 = re.compile(r"\"(\d{1,})\"", re.IGNORECASE)
p691 = re.compile(r"\'(\w{1,})\'", re.IGNORECASE)
p692 = re.compile(r"\'(\d{1,})\'", re.IGNORECASE)
'''
p7=re.compile("(\d{1,})\.", re.IGNORECASE)
p8=re.compile("(\w{1})\.", re.IGNORECASE)
p9=re.compile("(\d{1,})\.(\d{1,})\.", re.IGNORECASE)
p10=re.compile("(\w{1})\.(\w{1})\.", re.IGNORECASE)
'''
p11 = re.compile(r"(\d{1,2})\. (\w{3,}) (\d{2,})", re.IGNORECASE)  # 14. April 2003
p12 = re.compile(r"(\d{1,2})\.(\w{3,}) (\d{2,})", re.IGNORECASE)
p13 = re.compile(r"(\d{1,2})\.(\d{1,2}) (\d{2,})", re.IGNORECASE)
p14 = re.compile(r"(\d{1,2})\. (\d{1,2}) (\d{2,})", re.IGNORECASE)
p15 = re.compile(r"(\d{1,2})\. (\d{1,2})\. (\d{2,})", re.IGNORECASE)
p16 = re.compile(r"(\d{1,2})\.(\d{1,2})\.(\d{2,})", re.IGNORECASE)
p17 = re.compile(r"(\d{1,2})\. (\w{2,})", re.IGNORECASE)
p18 = re.compile(r"(\d{1,2})\.(\w{2,})", re.IGNORECASE)
r_DateRegexList.extend([p11, p12, p13, p14, p15, p16, p17, p18])
r_ForbiddenRegexList.extend([p01, p0, p1, p2, p3, p4, p5, p6])
'''
r_ForbiddenRegexList.extend([p7, p8, p9, p10])
'''
r_ForbiddenRegexList.extend([p11, p12, p13, p14, p15, p16])
r_ForbiddenRegexList.extend([p61, p62, p63, p64, p65, p66])
# new: 30.6.2017
#r_ForbiddenRegexList.extend([p67, p68, p691, p692])
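
# Quick self-check (sketch): the date patterns should match common German
# date spellings, e.g. p16 for purely numeric dates.
if debug:
    print(p16.search("14.04.2003") is not None)      # expected: True
    print(p11.search("14. April 2003") is not None)  # expected: True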
# Source: https://openjur.de/s/abkuerzungen.html
abbreviation_patterns = [
    r"a\.A\.", r"a\.a\.O\.", r"a\.E\.", r"a\.F\.", r"abgedr\.", r"abheb\.",
    r"abl\.", r"ABl\.", r"Abs\.", r"abw\.", r"Alt\.", r"Anh\.", r"Anm\.",
    r"arg\.", r"Art\.", r"Aufl\.", r"Bd\.", r"begl\.", r"begr\.", r"Beil\.",
    r"Beschl\.", r"Bespr\.", r"Best\.", r"bestr\.", r"betr\.", r"Bfg\.",
    r"BfgGer\.", r"Bl\.", r"bzw\.", r"c\.i\.c\.", r"d\.h\.", r"dagg\.",
    r"dch\.", r"ders\.", r"h\.M\.", r"i\.ü\.", r"i\.d\.F\.", r"i\.d\.R\.",
    r"i\.E\.", r"i\.e\.S\.", r"i\.gl\.S\.", r"i\.Grds\.", r"i\.R\.d\.",
    r"i\.R\.v\.", r"i\.S\.v\.", r"i\.V\.m\.", r"i\.w\.", r"i\.w\.S\.",
    r"jur\.", r"krit\.", r"lfd\.", r"lt\.", r"m\. Anm\.", r"m\. Bespr\.",
    r"m\.w\.N\.", r"m\.W\.v\.", r"mdl\.", r"mtl\.", r"n\.F\.", r"n\.n\.",
    r"n\.rk\.", r"n\.w\.N\.", r"o\.J\.", r"ord\.", r"Rdnr\.", r"Rdz\.",
    r"rel\.", r"Rev\.", r"Ri\.", r"Rspr\.", r"s\.a\.", r"s\.d\.",
    r"skept\.", r"sof\.", r"sog\.", r"st\. Rspr\.", r"str\.", r"u\.a\.",
    r"u\.ä\.", r"umstr\.", r"unbek\.", r"unbest\.", r"unpfb\.", r"unzul\.",
    r"unzustd\.", r"Urt\.", r"v\.A\.w\.", r"Var\.", r"Verf\.", r"Verh\.",
    r"Verk\.", r"Verz\.", r"Vfg\.", r"vgl\.", r"Vorbem\.", r"vorgen\.",
    r"zusf\.", r"zust\.", r"zutr\.", r"zw\.",
    # runs of sentence punctuation
    r"\.{2,}", r"\!{2,}", r"\?{2,}",
    # titles and common abbreviations
    r"ca\.", r"Prof\.", r"Dr\.", r"med\.",
]
for pattern in abbreviation_patterns:
    r_ForbiddenRegexList.append(re.compile(pattern, re.IGNORECASE))
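
# Quick self-check (sketch): an abbreviation such as "vgl." must be caught
# by the forbidden list so it is never treated as a sentence ending.
if debug:
    print(any(r.search("vgl.") for r in r_ForbiddenRegexList))  # expected: True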
muster_content_abkuerzung = "mustererkennung_abkuerzung.txt"
muster_content_gericht = "mustererkennung_gericht.txt"
muster_content_gesetz = "mustererkennung_gesetz.txt"
lexikon_content = "lexikon_uniq.txt"
with open(muster_content_gericht, 'r', encoding='latin-1') as f:
    mustererkennung_gericht = f.read()
with open(muster_content_gesetz, 'r', encoding='latin-1') as f:
    mustererkennung_gesetz = f.read()
with open(muster_content_abkuerzung, 'r', encoding='latin-1') as f:
    mustererkennung_abkuerzung = f.read()
with open(lexikon_content, 'r', encoding='latin-1') as f:
    lexikon_lesen = f.read()
# materialize as lists: a map() iterator would be exhausted after the
# first lookup pass and all later calls would see empty pattern lists
t_musterListGericht = [s.strip() for s in mustererkennung_gericht.split("\n")]
t_musterListGesetz = [s.strip() for s in mustererkennung_gesetz.split("\n")]
t_lexikonList = [s.strip() for s in lexikon_lesen.split("\n")]
def isMusterLexikonFound(text):
    """Return the first lexicon entry occurring in text, else an empty string."""
    if not isinstance(text, str):
        return ""
    for ele in t_lexikonList:
        ele = ele.strip()
        if ele.startswith("#") or len(ele) < 1:
            continue
        if ele in text:
            return ele
        # whole-word match; re.escape() guards entries containing regex metacharacters
        k = re.compile(r'\b%s\b' % re.escape(ele))
        if k.search(text):
            return ele
    return ""
def isMusterGesetzFound(text):
    """Return the first law-pattern entry occurring in text, else an empty string."""
    if not isinstance(text, str):
        return ""
    for ele in t_musterListGesetz:
        ele = ele.strip()
        if ele.startswith("#") or len(ele) < 1:
            continue
        # idea: a plain `if " "+ele+" " in text:` substring test would cost almost
        # nothing; the word-boundary regex below is very CPU-intensive
        k = re.compile(r'\b%s\b' % re.escape(ele))
        if k.search(text):
            return ele
    return ""
def isMusterGerichtFound(text):
    """Return the first court-pattern entry occurring in text, else an empty string."""
    if not isinstance(text, str):
        return ""
    for ele in t_musterListGericht:
        ele = ele.strip()
        if ele.startswith("#") or len(ele) < 1:
            continue
        # idea: a plain substring test would be much cheaper;
        # the word-boundary regex below is very CPU-intensive
        k = re.compile(r'\b%s\b' % re.escape(ele))
        if k.search(text):
            return ele
    return ""
def isLexikonSimple(text):
    """Collect all lexicon entries found in the whitespace tokens of text."""
    lexikon = set()
    sentElements = text.strip().split()
    for s in sentElements:
        r_String = isMusterLexikonFound(s)
        if len(r_String) > 0:
            lexikon.add(r_String)
    return lexikon
def isGerichtSimple(text):
    """Collect all court names found in the whitespace tokens of text."""
    gericht = set()
    sentElements = text.strip().split()
    for s in sentElements:
        r_String = isMusterGerichtFound(s)
        if len(r_String) > 0:
            gericht.add(r_String)
    return gericht
def isGesetzSimple(text):
    """Collect all law names found in the whitespace tokens of text."""
    gesetz = set()
    sentElements = text.strip().split()
    for s in sentElements:
        r_String = isMusterGesetzFound(s)
        if len(r_String) > 0:
            gesetz.add(r_String)
    return gesetz
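
# Sketch: the *Simple helpers test each whitespace token on its own, so only
# single-token entries (e.g. "BGH", assumed to be listed in the pattern file)
# can match here; multi-word court names will not.
if debug:
    print(isGerichtSimple("Der BGH verwarf die Revision."))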
def only_numerics(seq):
    """Keep only the digit characters of seq."""
    seq_type = type(seq)
    return seq_type().join(filter(seq_type.isdigit, seq))
def only_letter(seq):
    """Keep only the alphabetic characters of seq."""
    seq_type = type(seq)
    return seq_type().join(filter(seq_type.isalpha, seq))
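
# Quick self-check for the two filters above.
if debug:
    print(only_numerics("38a"))  # -> "38"
    print(only_letter("38a"))    # -> "a"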
def isParagraphSimple(text):
    """
    Recognize statute citations such as:
        § 3 EStG
        § 2 Abs 1 EStG
        §§ 2 und 3 EStG
        § 22 Nr 5 EStG
        § 38a Abs 1 Satz 3 Einkommenssteuergesetz
        § 2 Abs 7 Satz 2
        SozR 4-7837 § 1 Nr 4
        § 2 Abs 1 Satz 1 Nr 1 bis 4 EStG
    General shape: paragraph sign, number, word (short or long).
    """
    paragraph = set()
    sentElements = text.strip()
    matches = re.findall(r"§ \d{1,6} \w{2,}", sentElements)
    if matches:
        for match in matches:
            match = match.strip()
            if "Abs" not in match:
                # simple case: "§ <number> <law>" - the last token must exactly
                # equal a known law name from the pattern file
                t_ssplit = match.split()
                r_last = only_letter(t_ssplit[-1])
                for tmV1 in t_musterListGesetz:
                    if r_last in tmV1 and len(r_last) > 1 and len(tmV1) > 1 and len(r_last) == len(tmV1):
                        paragraph.add(match)
            if "Abs" in match or "Nr" in match:
                # extended case: scan from the match onward for a law name and
                # rebuild the full citation span
                r_Index = sentElements.index(match)
                r_Content = sentElements[r_Index:len(sentElements)]
                r_split = r_Content.split()
                for tm in t_musterListGesetz:
                    tm = tm.strip()
                    for rw in r_split:
                        rw = only_letter(rw.strip())
                        if rw in tm and len(rw) > 1 and len(tm) > 1 and len(rw) == len(tm):
                            r_IndexV2 = 0
                            try:
                                r_IndexV2 = sentElements.index(rw)
                            except Exception:
                                pass
                            if r_IndexV2 > r_Index:
                                match = match.replace("Abs", "").replace("Nr", "")
                                paragraph.add(match + tm)
    return paragraph
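
# Illustrative usage (sketch): recognizing "EStG" assumes that it is listed
# in mustererkennung_gesetz.txt.
if debug:
    print(isParagraphSimple("Nach § 38a Abs 1 Satz 3 EStG gilt dies."))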
def encodeToLatin1(text):
    # note: decoding UTF-8 bytes as Latin-1 splits multi-byte characters;
    # kept as-is since downstream consumers read Latin-1 files
    encResults = text.encode('utf-8', "ignore")
    return str(encResults.decode('latin-1', "ignore"))
def encodeToAscii(text):
    encResults = text.encode('utf-8', "ignore")
    return str(encResults.decode('ascii', "replace"))
def encodeToUTF8Adv(text):
    encResults = text.encode('utf-8', "ignore")
    # 'ignore' drops undecodable bytes ('remove' is not a registered error handler)
    return str(encResults.decode('utf-8', "ignore"))
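
# Quick self-check (sketch): decoding UTF-8 bytes as Latin-1 splits multi-byte
# characters, e.g. "ß" (0xC3 0x9F in UTF-8) becomes two Latin-1 characters.
if debug:
    print(encodeToLatin1("Fußball"))  # -> "FuÃ\x9fball"
    print(encodeToAscii("Fußball"))   # each non-ASCII byte becomes U+FFFD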
def isDate(word):
    """True if any of the date patterns matches word."""
    for r in r_DateRegexList:
        if r.search(word) is not None:
            return True
    return False
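
# Quick self-check (sketch): p11 covers dates written out like "14. April 2003".
if debug:
    print(isDate("14. April 2003"))  # expected: True
    print(isDate("laufen"))          # expected: False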
def isSentenceEnding(word):
    isSentenceEndingElement = False
    y = pSentenceDelimiter.search(word)
    y2 = a0.search(word)
    pWord = re.compile(r"(\w{2,})", re.IGNORECASE)
    x1 = pWord.search(word)
    # word plus closing parenthesis plus delimiter, e.g. "(siehe oben)."
    if x1 is not None and len(word) >= 2 and y is not None and y2 is not None:
        return True
    if y is not None:
        isSentenceEndingElement = True
    if y2 is not None:
        isSentenceEndingElement = True
    for r in r_ForbiddenRegexList:
        x = r.search(word)
        # forbidden abbreviation - not a sentence ending, unless the word
        # looks like "laufen (S.E.0)"
        if x is not None and y2 is None:
            return False
    if isSentenceEndingElement is True and len(word) > 2:
        return True
    elif isSentenceEndingElement is True and word in " ":
        return True
    else:
        return False
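
# Quick self-check (sketch): an ordinary word ending in "." closes a sentence,
# a known abbreviation does not.
if debug:
    print(isSentenceEnding("Haus."))  # expected: True
    print(isSentenceEnding("vgl."))   # expected: False (forbidden abbreviation)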
def mySentenceSplitter(text):
    """Split text into sentences, respecting abbreviations and date patterns."""
    words = text.split()
    oneSent = ""
    allSent = list()
    sentence_delimiter = ['.', '!', '?']
    # note: the loop stops before the final token, so a trailing sentence
    # fragment is never emitted (original behavior)
    for x in range(0, len(words) - 1):
        c = words[x - 1]
        a = words[x]
        b = words[x + 1]
        w = a
        v = c + " " + a + " " + b  # three-word context window for date detection
        oneSent = oneSent + " " + w
        if isSentenceEnding(w) is True and isDate(v) is False:
            lastchar = w[-1:].strip()
            # only close the sentence if the word really ends in a delimiter
            if lastchar in sentence_delimiter:
                oneSent = oneSent + " " + "\n"
                allSent.append(oneSent)
                oneSent = ""
    return allSent
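
# Illustrative usage (sketch): only sentences that end before the final token
# are emitted, mirroring the loop bounds above.
if debug:
    for s in mySentenceSplitter("Das Wetter ist schön. Die Sonne scheint hell."):
        print(s)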
def topicModeling(text):
    words = []
    text2 = mySentenceSplitter(text)
    # tokenize the sentences, dropping stopwords, short and non-alphanumeric tokens
    for w in text2:
        for t in w.split():
            if t.lower() in stopwords or t in stopwords:
                continue
            if t.lower() in stopwordsDE or t in stopwordsDE:
                continue
            if t.lower() in cachedStopWords or t in cachedStopWords:
                continue
            if len(t) < 3:
                continue
            if not t.isalnum():
                continue
            #if not sentify.isNotBlacklisted(t):
            #    continue
            words.append(encodeToUTF8Adv(t))
    # number of training epochs
    num_epochs = 5
    # number of topics to model
    num_topics_my = 23
    # number of worker threads to run in parallel
    num_workers = 8
    # minimum probability for a topic to be reported
    minimum_probability_my = 0.75  # 0.55
    # turn our tokenized document into an id <-> term dictionary
    dictionary = corpora.Dictionary([words])
    # convert the dictionary to a bag-of-words corpus for reference
    corpus = [dictionary.doc2bow(t) for t in [words]]
    if debug:
        print("gensim.topicModeling(): Start MultiCore Topic Modelling")
    ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics=num_topics_my, id2word=dictionary, chunksize=30000, passes=num_epochs, workers=num_workers)
    b = ldamodel.get_document_topics(corpus, minimum_probability=minimum_probability_my, per_word_topics=False)
    c = set()
    for ele in b:
        for e in ele:
            c.add(e)
    resultSet = set()
    resultList = []
    # sort the (topic_id, probability) pairs by probability, highest first
    d = sorted(c, reverse=True, key=lambda x: x[1])
    if debug:
        print("gensim.topicModeling(): Parsing Results of Topic Modelling")
    for e in d:
        topic_id = e[0]
        f = ldamodel.show_topic(topic_id, topn=15)
        for f_e in f:
            word = f_e[0]
            if len(resultList) >= max_gensim_results:
                return resultList
            if word not in resultSet:
                resultList.append(word)
                resultSet.add(word)
    if debug:
        print("gensim.topicModeling():", resultList)
    return resultList
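
# Illustrative usage (sketch): LDA training is stochastic and a single short
# text is only a smoke test; real inputs should be much longer documents.
if debug:
    sample = ("Das Gericht entschied über die Revision. "
              "Die Revision wurde zurückgewiesen. "
              "Das Urteil ist damit rechtskräftig.")
    print(topicModeling(sample))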
"""
def mySentenceSplitterStringReturn(text):
from sumy.models.dom._sentence import Sentence
from sumy.nlp.tokenizers import Tokenizer
words = text.split()
oneSent=""
allSent=list()
allReturn=""
#for w in words:
for x in range(0, len(words)-1):
c=words[x-1]
a=words[x]
b=words[x+1]
w=a
v=c+" "+a+" "+b
oneSent=oneSent+" "+w
#oneSent=oneSent+" ;;;"+v
#and isDate(v) is False
if isSentenceEnding(w) is True and isDate(v) is False:
my_o=oneSent
s = Sentence(my_o, Tokenizer("german"))
allReturn+=my_o+" "+"\n"
#oneSent=oneSent+"{}".format(isDate(v))+"
"
#oneSent=oneSent+" "+"
"
oneSent=oneSent+" "+"\n"
allSent.append(s)
oneSent=""
return allSent
"""
def convertStr(s):
    """Convert string to either int or float (raises ValueError if neither parses)."""
    try:
        ret = int(s)
    except ValueError:
        # fall back to float
        ret = float(s)
    return ret
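
# Quick self-check: int parsing wins over float when both would succeed.
if debug:
    print(convertStr("3"))    # -> 3 (int)
    print(convertStr("3.5"))  # -> 3.5 (float)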