#!/usr/bin/env python
# -*- coding: utf-8 -*-
# https://developers.google.com/custom-search/docs/xml_results#countryCodes
# https://www.linkedin.com/countserv/count/share?format=jsonp&url=https://www.buzzerstar.com
# pip install --upgrade spacy tensorflow gensim sumy keras markovify google-api-python-client beautifulsoup4
# pip3 install stop-words
#from sphinxapi import *
import os
import sys
import re
import time
import math
import glob
import json
import getopt
import string
import codecs
import pickle
import hashlib
import logging
import argparse
import nltk
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, PunktSentenceTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from stop_words import get_stop_words
from unidecode import unidecode
from datetime import datetime as dTime
from pprint import PrettyPrinter
#from textblob import TextBlob as tb
#from textblob_de import TextBlobDE as tb

stopwordsDE = get_stop_words('de')
cachedStopWords = stopwords.words("german")
# load nltk's German stopwords as a variable called 'stopwords'
# (this shadows the imported nltk stopwords module of the same name)
stopwords = nltk.corpus.stopwords.words('german')

debug = False
max_gensim_results = 23
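
# Illustrative sketch (not part of the pipeline): the three stopword
# collections above overlap heavily and are all consulted in topicModeling().
if debug:
    print("und" in stopwordsDE)      # expected: True ("und" is a German stopword)
    print("und" in cachedStopWords)  # expected: True
    print("und" in stopwords)        # expected: True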
r_DateRegexList = list()
r_ForbiddenRegexList = list()
pSentenceDelimiter = re.compile(r"(\.|\?|\!)", re.IGNORECASE)
a0 = re.compile(r"(\)\.|\)\!|\)\?)", re.IGNORECASE)
# German abbreviations that must never be treated as sentence endings
p01 = re.compile(r"bzw\.", re.IGNORECASE)
p0 = re.compile(r"Rdn\.", re.IGNORECASE)
p1 = re.compile(r"Abs\.", re.IGNORECASE)
p2 = re.compile(r"Nr\.", re.IGNORECASE)
p3 = re.compile(r"Art\.", re.IGNORECASE)
p4 = re.compile(r"Aufl\.", re.IGNORECASE)
p5 = re.compile(r"vgl\.", re.IGNORECASE)
p6 = re.compile(r"Einf\.", re.IGNORECASE)
p61 = re.compile(r"ff\.", re.IGNORECASE)
p62 = re.compile(r"gem\.", re.IGNORECASE)
p63 = re.compile(r"Buchst\.", re.IGNORECASE)
p64 = re.compile(r"Dipl\.-Ing\.", re.IGNORECASE)
p65 = re.compile(r"Dipl\.", re.IGNORECASE)
p66 = re.compile(r"Ing\.", re.IGNORECASE)
# new: 30.6.2017 - quoted words and numbers
p67 = re.compile(r"\"(\w{1,})\"", re.IGNORECASE)
p68 = re.compile(r"\"(\d{1,})\"", re.IGNORECASE)
p691 = re.compile(r"\'(\w{1,})\'", re.IGNORECASE)
p692 = re.compile(r"\'(\d{1,})\'", re.IGNORECASE)
'''
p7=re.compile("(\d{1,})\.", re.IGNORECASE)
p8=re.compile("(\w{1})\.", re.IGNORECASE)
p9=re.compile("(\d{1,})\.(\d{1,})\.", re.IGNORECASE)
p10=re.compile("(\w{1})\.(\w{1})\.", re.IGNORECASE)
'''
p11 = re.compile(r"(\d{1,2})\. (\w{3,}) (\d{2,})", re.IGNORECASE)  # 14. April 2003
p12 = re.compile(r"(\d{1,2})\.(\w{3,}) (\d{2,})", re.IGNORECASE)
p13 = re.compile(r"(\d{1,2})\.(\d{1,2}) (\d{2,})", re.IGNORECASE)
p14 = re.compile(r"(\d{1,2})\. (\d{1,2}) (\d{2,})", re.IGNORECASE)
p15 = re.compile(r"(\d{1,2})\. (\d{1,2})\. (\d{2,})", re.IGNORECASE)
p16 = re.compile(r"(\d{1,2})\.(\d{1,2})\.(\d{2,})", re.IGNORECASE)
p17 = re.compile(r"(\d{1,2})\. (\w{2,})", re.IGNORECASE)
p18 = re.compile(r"(\d{1,2})\.(\w{2,})", re.IGNORECASE)
r_DateRegexList.extend([p11, p12, p13, p14, p15, p16, p17, p18])
r_ForbiddenRegexList.extend([p01, p0, p1, p2, p3, p4, p5, p6])
'''
r_ForbiddenRegexList.extend([p7, p8, p9, p10])
'''
r_ForbiddenRegexList.extend([p11, p12, p13, p14, p15, p16])
r_ForbiddenRegexList.extend([p61, p62, p63, p64, p65, p66])
# new: 30.6.2017
#r_ForbiddenRegexList.extend([p67, p68, p691, p692])
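
# Quick self-check (sketch): the date patterns should match common German
# date spellings, e.g. p16 for purely numeric dates.
if debug:
    print(p16.search("14.04.2003") is not None)      # expected: True
    print(p11.search("14. April 2003") is not None)  # expected: True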
# Source: https://openjur.de/s/abkuerzungen.html
abbreviation_patterns = [
    r"a\.A\.", r"a\.a\.O\.", r"a\.E\.", r"a\.F\.", r"abgedr\.", r"abheb\.",
    r"abl\.", r"ABl\.", r"Abs\.", r"abw\.", r"Alt\.", r"Anh\.", r"Anm\.",
    r"arg\.", r"Art\.", r"Aufl\.", r"Bd\.", r"begl\.", r"begr\.", r"Beil\.",
    r"Beschl\.", r"Bespr\.", r"Best\.", r"bestr\.", r"betr\.", r"Bfg\.",
    r"BfgGer\.", r"Bl\.", r"bzw\.", r"c\.i\.c\.", r"d\.h\.", r"dagg\.",
    r"dch\.", r"ders\.", r"h\.M\.", r"i\.ü\.", r"i\.d\.F\.", r"i\.d\.R\.",
    r"i\.E\.", r"i\.e\.S\.", r"i\.gl\.S\.", r"i\.Grds\.", r"i\.R\.d\.",
    r"i\.R\.v\.", r"i\.S\.v\.", r"i\.V\.m\.", r"i\.w\.", r"i\.w\.S\.",
    r"jur\.", r"krit\.", r"lfd\.", r"lt\.", r"m\. Anm\.", r"m\. Bespr\.",
    r"m\.w\.N\.", r"m\.W\.v\.", r"mdl\.", r"mtl\.", r"n\.F\.", r"n\.n\.",
    r"n\.rk\.", r"n\.w\.N\.", r"o\.J\.", r"ord\.", r"Rdnr\.", r"Rdz\.",
    r"rel\.", r"Rev\.", r"Ri\.", r"Rspr\.", r"s\.a\.", r"s\.d\.",
    r"skept\.", r"sof\.", r"sog\.", r"st\. Rspr\.", r"str\.", r"u\.a\.",
    r"u\.ä\.", r"umstr\.", r"unbek\.", r"unbest\.", r"unpfb\.", r"unzul\.",
    r"unzustd\.", r"Urt\.", r"v\.A\.w\.", r"Var\.", r"Verf\.", r"Verh\.",
    r"Verk\.", r"Verz\.", r"Vfg\.", r"vgl\.", r"Vorbem\.", r"vorgen\.",
    r"zusf\.", r"zust\.", r"zutr\.", r"zw\.",
    # runs of sentence punctuation
    r"\.{2,}", r"\!{2,}", r"\?{2,}",
    # titles and common abbreviations
    r"ca\.", r"Prof\.", r"Dr\.", r"med\.",
]
for pattern in abbreviation_patterns:
    r_ForbiddenRegexList.append(re.compile(pattern, re.IGNORECASE))
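
# Quick self-check (sketch): an abbreviation such as "vgl." must be caught
# by the forbidden list so it is never treated as a sentence ending.
if debug:
    print(any(r.search("vgl.") for r in r_ForbiddenRegexList))  # expected: True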
muster_content_abkuerzung = "mustererkennung_abkuerzung.txt"
muster_content_gericht = "mustererkennung_gericht.txt"
muster_content_gesetz = "mustererkennung_gesetz.txt"
lexikon_content = "lexikon_uniq.txt"
with open(muster_content_gericht, 'r', encoding='latin-1') as f:
    mustererkennung_gericht = f.read()
with open(muster_content_gesetz, 'r', encoding='latin-1') as f:
    mustererkennung_gesetz = f.read()
with open(muster_content_abkuerzung, 'r', encoding='latin-1') as f:
    mustererkennung_abkuerzung = f.read()
with open(lexikon_content, 'r', encoding='latin-1') as f:
    lexikon_lesen = f.read()
# materialize as lists: a map() iterator would be exhausted after the
# first lookup pass and all later calls would see empty pattern lists
t_musterListGericht = [s.strip() for s in mustererkennung_gericht.split("\n")]
t_musterListGesetz = [s.strip() for s in mustererkennung_gesetz.split("\n")]
t_lexikonList = [s.strip() for s in lexikon_lesen.split("\n")]
def isMusterLexikonFound(text):
    """Return the first lexicon entry occurring in text, else an empty string."""
    if not isinstance(text, str):
        return ""
    for ele in t_lexikonList:
        ele = ele.strip()
        if ele.startswith("#") or len(ele) < 1:
            continue
        if ele in text:
            return ele
        # whole-word match; re.escape() guards entries containing regex metacharacters
        k = re.compile(r'\b%s\b' % re.escape(ele))
        if k.search(text):
            return ele
    return ""
def isMusterGesetzFound(text):
    """Return the first law-pattern entry occurring in text, else an empty string."""
    if not isinstance(text, str):
        return ""
    for ele in t_musterListGesetz:
        ele = ele.strip()
        if ele.startswith("#") or len(ele) < 1:
            continue
        # idea: a plain `if " "+ele+" " in text:` substring test would cost almost
        # nothing; the word-boundary regex below is very CPU-intensive
        k = re.compile(r'\b%s\b' % re.escape(ele))
        if k.search(text):
            return ele
    return ""
def isMusterGerichtFound(text):
    """Return the first court-pattern entry occurring in text, else an empty string."""
    if not isinstance(text, str):
        return ""
    for ele in t_musterListGericht:
        ele = ele.strip()
        if ele.startswith("#") or len(ele) < 1:
            continue
        # idea: a plain substring test would be much cheaper;
        # the word-boundary regex below is very CPU-intensive
        k = re.compile(r'\b%s\b' % re.escape(ele))
        if k.search(text):
            return ele
    return ""
def isLexikonSimple(text):
    """Collect all lexicon entries found in the whitespace tokens of text."""
    lexikon = set()
    sentElements = text.strip().split()
    for s in sentElements:
        r_String = isMusterLexikonFound(s)
        if len(r_String) > 0:
            lexikon.add(r_String)
    return lexikon
def isGerichtSimple(text):
    """Collect all court names found in the whitespace tokens of text."""
    gericht = set()
    sentElements = text.strip().split()
    for s in sentElements:
        r_String = isMusterGerichtFound(s)
        if len(r_String) > 0:
            gericht.add(r_String)
    return gericht
def isGesetzSimple(text):
    """Collect all law names found in the whitespace tokens of text."""
    gesetz = set()
    sentElements = text.strip().split()
    for s in sentElements:
        r_String = isMusterGesetzFound(s)
        if len(r_String) > 0:
            gesetz.add(r_String)
    return gesetz
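
# Sketch: the *Simple helpers test each whitespace token on its own, so only
# single-token entries (e.g. "BGH", assumed to be listed in the pattern file)
# can match here; multi-word court names will not.
if debug:
    print(isGerichtSimple("Der BGH verwarf die Revision."))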
def only_numerics(seq):
    """Keep only the digit characters of seq."""
    seq_type = type(seq)
    return seq_type().join(filter(seq_type.isdigit, seq))
def only_letter(seq):
    """Keep only the alphabetic characters of seq."""
    seq_type = type(seq)
    return seq_type().join(filter(seq_type.isalpha, seq))
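
# Quick self-check for the two filters above.
if debug:
    print(only_numerics("38a"))  # -> "38"
    print(only_letter("38a"))    # -> "a"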
def isParagraphSimple(text):
    """
    Recognize statute citations such as:
        § 3 EStG
        § 2 Abs 1 EStG
        §§ 2 und 3 EStG
        § 22 Nr 5 EStG
        § 38a Abs 1 Satz 3 Einkommenssteuergesetz
        § 2 Abs 7 Satz 2
        SozR 4-7837 § 1 Nr 4
        § 2 Abs 1 Satz 1 Nr 1 bis 4 EStG
    General shape: paragraph sign, number, word (short or long).
    """
    paragraph = set()
    sentElements = text.strip()
    matches = re.findall(r"§ \d{1,6} \w{2,}", sentElements)
    if matches:
        for match in matches:
            match = match.strip()
            if "Abs" not in match:
                # simple case: "§ <number> <law>" - the last token must exactly
                # equal a known law name from the pattern file
                t_ssplit = match.split()
                r_last = only_letter(t_ssplit[-1])
                for tmV1 in t_musterListGesetz:
                    if r_last in tmV1 and len(r_last) > 1 and len(tmV1) > 1 and len(r_last) == len(tmV1):
                        paragraph.add(match)
            if "Abs" in match or "Nr" in match:
                # extended case: scan from the match onward for a law name and
                # rebuild the full citation span
                r_Index = sentElements.index(match)
                r_Content = sentElements[r_Index:len(sentElements)]
                r_split = r_Content.split()
                for tm in t_musterListGesetz:
                    tm = tm.strip()
                    for rw in r_split:
                        rw = only_letter(rw.strip())
                        if rw in tm and len(rw) > 1 and len(tm) > 1 and len(rw) == len(tm):
                            r_IndexV2 = 0
                            try:
                                r_IndexV2 = sentElements.index(rw)
                            except Exception:
                                pass
                            if r_IndexV2 > r_Index:
                                match = match.replace("Abs", "").replace("Nr", "")
                                paragraph.add(match + tm)
    return paragraph
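
# Illustrative usage (sketch): recognizing "EStG" assumes that it is listed
# in mustererkennung_gesetz.txt.
if debug:
    print(isParagraphSimple("Nach § 38a Abs 1 Satz 3 EStG gilt dies."))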
def encodeToLatin1(text):
    # note: decoding UTF-8 bytes as Latin-1 splits multi-byte characters;
    # kept as-is since downstream consumers read Latin-1 files
    encResults = text.encode('utf-8', "ignore")
    return str(encResults.decode('latin-1', "ignore"))
def encodeToAscii(text):
    encResults = text.encode('utf-8', "ignore")
    return str(encResults.decode('ascii', "replace"))
def encodeToUTF8Adv(text):
    encResults = text.encode('utf-8', "ignore")
    # 'ignore' drops undecodable bytes ('remove' is not a registered error handler)
    return str(encResults.decode('utf-8', "ignore"))
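
# Quick self-check (sketch): decoding UTF-8 bytes as Latin-1 splits multi-byte
# characters, e.g. "ß" (0xC3 0x9F in UTF-8) becomes two Latin-1 characters.
if debug:
    print(encodeToLatin1("Fußball"))  # -> "FuÃ\x9fball"
    print(encodeToAscii("Fußball"))   # each non-ASCII byte becomes U+FFFD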
def isDate(word):
    """True if any of the date patterns matches word."""
    for r in r_DateRegexList:
        if r.search(word) is not None:
            return True
    return False
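
# Quick self-check (sketch): p11 covers dates written out like "14. April 2003".
if debug:
    print(isDate("14. April 2003"))  # expected: True
    print(isDate("laufen"))          # expected: False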
def isSentenceEnding(word):
    isSentenceEndingElement = False
    y = pSentenceDelimiter.search(word)
    y2 = a0.search(word)
    pWord = re.compile(r"(\w{2,})", re.IGNORECASE)
    x1 = pWord.search(word)
    # word plus closing parenthesis plus delimiter, e.g. "(siehe oben)."
    if x1 is not None and len(word) >= 2 and y is not None and y2 is not None:
        return True
    if y is not None:
        isSentenceEndingElement = True
    if y2 is not None:
        isSentenceEndingElement = True
    for r in r_ForbiddenRegexList:
        x = r.search(word)
        # forbidden abbreviation - not a sentence ending, unless the word
        # looks like "laufen (S.E.0)"
        if x is not None and y2 is None:
            return False
    if isSentenceEndingElement is True and len(word) > 2:
        return True
    elif isSentenceEndingElement is True and word in " ":
        return True
    else:
        return False
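
# Quick self-check (sketch): an ordinary word ending in "." closes a sentence,
# a known abbreviation does not.
if debug:
    print(isSentenceEnding("Haus."))  # expected: True
    print(isSentenceEnding("vgl."))   # expected: False (forbidden abbreviation)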
def mySentenceSplitter(text):
    """Split text into sentences, respecting abbreviations and date patterns."""
    words = text.split()
    oneSent = ""
    allSent = list()
    sentence_delimiter = ['.', '!', '?']
    # note: the loop stops before the final token, so a trailing sentence
    # fragment is never emitted (original behavior)
    for x in range(0, len(words) - 1):
        c = words[x - 1]
        a = words[x]
        b = words[x + 1]
        w = a
        v = c + " " + a + " " + b  # three-word context window for date detection
        oneSent = oneSent + " " + w
        if isSentenceEnding(w) is True and isDate(v) is False:
            lastchar = w[-1:].strip()
            # only close the sentence if the word really ends in a delimiter
            if lastchar in sentence_delimiter:
                oneSent = oneSent + " " + "\n"
                allSent.append(oneSent)
                oneSent = ""
    return allSent
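
# Illustrative usage (sketch): only sentences that end before the final token
# are emitted, mirroring the loop bounds above.
if debug:
    for s in mySentenceSplitter("Das Wetter ist schön. Die Sonne scheint hell."):
        print(s)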
def topicModeling(text):
    words = []
    text2 = mySentenceSplitter(text)
    # tokenize the sentences, dropping stopwords, short and non-alphanumeric tokens
    for w in text2:
        for t in w.split():
            if t.lower() in stopwords or t in stopwords:
                continue
            if t.lower() in stopwordsDE or t in stopwordsDE:
                continue
            if t.lower() in cachedStopWords or t in cachedStopWords:
                continue
            if len(t) < 3:
                continue
            if not t.isalnum():
                continue
            #if not sentify.isNotBlacklisted(t):
            #    continue
            words.append(encodeToUTF8Adv(t))
    # number of training epochs
    num_epochs = 5
    # number of topics to model
    num_topics_my = 23
    # number of worker threads to run in parallel
    num_workers = 8
    # minimum probability for a topic to be reported
    minimum_probability_my = 0.75  # 0.55
    # turn our tokenized document into an id <-> term dictionary
    dictionary = corpora.Dictionary([words])
    # convert the dictionary to a bag-of-words corpus for reference
    corpus = [dictionary.doc2bow(t) for t in [words]]
    if debug:
        print("gensim.topicModeling(): Start MultiCore Topic Modelling")
    ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics=num_topics_my, id2word=dictionary, chunksize=30000, passes=num_epochs, workers=num_workers)
    b = ldamodel.get_document_topics(corpus, minimum_probability=minimum_probability_my, per_word_topics=False)
    c = set()
    for ele in b:
        for e in ele:
            c.add(e)
    resultSet = set()
    resultList = []
    # sort the (topic_id, probability) pairs by probability, highest first
    d = sorted(c, reverse=True, key=lambda x: x[1])
    if debug:
        print("gensim.topicModeling(): Parsing Results of Topic Modelling")
    for e in d:
        topic_id = e[0]
        f = ldamodel.show_topic(topic_id, topn=15)
        for f_e in f:
            word = f_e[0]
            if len(resultList) >= max_gensim_results:
                return resultList
            if word not in resultSet:
                resultList.append(word)
                resultSet.add(word)
    if debug:
        print("gensim.topicModeling():", resultList)
    return resultList
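
# Illustrative usage (sketch): LDA training is stochastic and a single short
# text is only a smoke test; real inputs should be much longer documents.
if debug:
    sample = ("Das Gericht entschied über die Revision. "
              "Die Revision wurde zurückgewiesen. "
              "Das Urteil ist damit rechtskräftig.")
    print(topicModeling(sample))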
"""
def mySentenceSplitterStringReturn(text):
from sumy.models.dom._sentence import Sentence
from sumy.nlp.tokenizers import Tokenizer
words = text.split()
oneSent=""
allSent=list()
allReturn=""
#for w in words:
for x in range(0, len(words)-1):
c=words[x-1]
a=words[x]
b=words[x+1]
w=a
v=c+" "+a+" "+b
oneSent=oneSent+" "+w
#oneSent=oneSent+" ;;;"+v
#and isDate(v) is False
if isSentenceEnding(w) is True and isDate(v) is False:
my_o=oneSent
s = Sentence(my_o, Tokenizer("german"))
allReturn+=my_o+" "+"\n"
#oneSent=oneSent+"{}".format(isDate(v))+"
"
#oneSent=oneSent+" "+"
"
oneSent=oneSent+" "+"\n"
allSent.append(s)
oneSent=""
return allSent
"""
def convertStr(s):
    """Convert string to either int or float (raises ValueError if neither parses)."""
    try:
        ret = int(s)
    except ValueError:
        # fall back to float
        ret = float(s)
    return ret
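
# Quick self-check: int parsing wins over float when both would succeed.
if debug:
    print(convertStr("3"))    # -> 3 (int)
    print(convertStr("3.5"))  # -> 3.5 (float)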