#!/usr/bin/python
# -*- coding: ISO-8859-1 -*-
from __future__ import division
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
###########################
### Autor: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.7 - 17-10-2015@23:53 Uhr
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python
###########################
#https://docs.python.org/2/library/configparser.html
######## export PYTHON_EGG_CACHE=/tmp
import pprint
import os
import nltk
# import rocksdb # shared library kann aktuell noch nicht gelesen werden
import MySQLdb # apt-get install python-mysqldb
from sphinxit.core.processor import Search # http://sphinxit.readthedocs.org/en/latest/
from sphinxit.core.helpers import BaseSearchConfig
from random import randint
from past.builtins import basestring # pip install future
import codecs
import sys
from sumy.parsers.plaintext import PlaintextParser # https://github.com/miso-belica/sumy
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
import re
from transliterate import translit, get_available_language_codes
import libleipzig
import pprint
import json
from textstat.textstat import textstat # https://pypi.python.org/pypi/textstat
os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'
###python -m nltk.downloader -d /usr/share/nltk_data all
####python -m nltk.downloader all
###########nltk.download()
# nltk.download("punkt")
# Py2-only: re-expose sys.setdefaultencoding (hidden after interpreter
# start) and force UTF-8 as the process-wide default encoding.
reload(sys)
sys.setdefaultencoding('utf-8')
# synonyms already substituted once — never reuse the same synonym twice
noDoubleHash = set()
###re_match = r"[(\?|\.|\!)][(\t|\r|\n|\s|\w){0,}]([A-Za-z0-9]{1,})" # Match: ". WORT"
# matches a token that ENDS a sentence (., ? or ! as its last character)
re_match = r"(\?|\.|\!)$" # Match: ". WORT"
# read the input and output file paths from the command line
inputfile = sys.argv[1]
outputfile = sys.argv[2]
# http://www.tutorialspoint.com/python/python_command_line_arguments.htm
# read whole input file into one string
text = open(inputfile, 'r').read()  # NOTE(review): file handle is never closed
#text.decode('utf-8')
text = text.decode("utf-8")
class SphinxitConfig(BaseSearchConfig):
    """Sphinxit search configuration: searchd reachable on 127.0.0.1:9977."""
    DEBUG = False
    WITH_META = False
    WITH_STATUS = False
    POOL_SIZE = 5            # connection pool size for searchd queries
    # SQL_ENGINE = 'oursql'
    SEARCHD_CONNECTION = {
        'host': '127.0.0.1',
        'port': 9977,
    }
# delimiters = ['\n', ' ', ',', '.', '?', '!', ':', ';', '\s', '\t', '\r']
# http://pyrocksdb.readthedocs.org/en/v0.4/tutorial/index.html
# https://github.com/sphinxsearch/sphinx/blob/master/api/sphinxapi.py
# http://www.tutorialspoint.com/python/python_database_access.htm
# mysql = MySQLdb.connect("localhost","root","###########99","onetipp" ) # last working
# Sphinx searchd speaks the MySQL wire protocol (SphinxQL) on port 9977,
# which is why the MySQLdb driver is used for both connections.
sphinx = MySQLdb.connect(
    host='127.0.0.1',
    user='root',
    passwd='###########99',
    db='onetipp',
    port=9977) # sphinxQL
cursorSphinx = sphinx.cursor()
# regular MySQL server holding the onetipp synonym/name tables
mysql = MySQLdb.connect(
    host='127.0.0.1',
    user='root',
    passwd='###########99',
    db='onetipp',
    port=3306) # Mysql
mysql.autocommit(True)   # every statement committed immediately
cursorMysql = mysql.cursor()
def log_warnings(curs):
    """Forward MySQL warnings collected on cursor *curs* to the log.

    Iterates ``curs.messages`` (a MySQLdb cursor attribute holding
    (exception_class, message) pairs) and logs every MySQLdb.Warning.
    """
    # Bug fix: `logging` was referenced but never imported anywhere in
    # this module, so the original raised NameError on first call.
    import logging
    for msg in curs.messages:
        if msg[0] == MySQLdb.Warning:
            # logging.warning(): warn() is only a deprecated alias
            logging.warning(msg[1])
def deumlaut(s):
    """
    Replaces umlauts with fake-umlauts
    """
    # (old, new) pairs applied in order.  Only the sharp-s entry changes
    # the text; the umlaut entries map each character onto itself (the
    # original chained .replace() calls behaved the same way).
    substitutions = (
        ('\xdf', 'ss'),
        ('\xfc', 'ü'),
        ('\xdc', 'Ü'),
        ('\xf6', 'ö'),
        ('\xd6', 'Ö'),
        ('\xe4', 'ä'),
        ('\xc4', 'Ä'),
    )
    for old, new in substitutions:
        s = s.replace(old, new)
    return s
def summarizeText(s):
    """Summarize German text *s* with sumy's LSA summarizer.

    Keeps a randomly chosen 90-100% of the original sentence count and
    returns the kept sentences joined by (and terminated with) a space.
    """
    ## sumy: https://github.com/miso-belica/sumy/tree/dev/sumy/summarizers
    total = len(nltk.sent_tokenize(s))
    # pick the number of sentences to keep from the top ~10% band
    keep = randint(int((total / 100) * 90) + 1, total)
    # randCount = random.randint(iround(float((total / 100) * 55)), iround(total))
    document = PlaintextParser.from_string(s, Tokenizer("german")).document
    summarizer = Summarizer(Stemmer("german"))
    # summarizer = TextRankSummarizer(stemmer)
    #ISO-8859-1
    parts = []
    for sentence in summarizer(document, keep):
        parts.append(str(sentence))
        parts.append(" ")
    return "".join(parts)
# Todos:
# create a stopword list in German
# if a stopword is part of a synonym
# give bad minus points
def SynRanker(s,t):
    """Rank synonym candidate *s* as a replacement for original word *t*.

    Returns a float score (higher is better), or a hard failure code:
    -10 for missing / non-string / empty input, -1 when the candidate is
    identical to the original word.

    Cleanups vs. the original: removed dead statements that mutated
    startVal right before an unconditional return, the no-op ``else: 1``
    branches, unused locals (delimiters, synHasDigits, synhasSonder) and
    the garbled commented-out debug prints.  Scoring is unchanged.
    """
    if not s or not t:
        return -10
    # py2/py3-compatible string check (basestring exists only on py2)
    try:
        _string_types = basestring
    except NameError:
        _string_types = str
    if not isinstance(s, _string_types) or not isinstance(t, _string_types):
        return -10

    startVal = 1.0
    lenSyn = len(s)
    re_sonder = r"(\?|\.|\,|\;|\:|\!|\d)"  # punctuation or digits
    # NOTE(review): \w makes this match nearly every candidate, so the
    # -0.50 penalty below applies almost always — kept for compatibility.
    re_space = r"(\t|\r|\n|\s|\w)"
    firstS = s[0:1]
    firstT = t[0:1]

    if s == t:
        return -1   # identical word is useless as a synonym
    if lenSyn <= 0:
        return -10  # unreachable after the "not s" guard; kept for safety

    # length scoring: 4..13 characters is the sweet spot
    if 3 < lenSyn < 14:
        startVal += 0
    elif lenSyn <= 3:
        startVal -= 0.35

    # multi-word synonyms are penalised, short single words rewarded
    if (' ' in s) and lenSyn >= 14:
        startVal -= 0.75
    elif (' ' in s) and lenSyn < 14:
        startVal -= 0.55
    elif (' ' not in s) and lenSyn >= 14:
        startVal -= 0.05
    elif (' ' not in s) and lenSyn < 14:
        startVal += 0.05

    if re.search(re_space, s) is not None:
        startVal -= 0.50
    if re.search(re_sonder, s) is not None:
        startVal -= 0.075

    # reward matching capitalisation of the first letter, punish mismatch
    if firstS.isupper() and firstT.isupper():
        startVal += 0.15
    elif firstS.islower() and firstT.islower():
        startVal += 0.15
    elif firstS.isupper() and not firstT.isupper():
        startVal -= 0.25
    elif firstS.islower() and not firstT.islower():
        startVal -= 0.25

    # later: richer result codes
    return float(startVal)
def iround(x):
    """iround(number) -> integer
    Round a number to the nearest integer."""
    shifted = round(x) - 0.5
    bump = 1 if x > 0 else 0
    return int(shifted) + bump
def getSynLeipzig(sl):
    """Look up synonyms for *sl* in the Leipzig thesaurus (libleipzig).

    Returns a list of synonym strings; empty list when *sl* is falsy,
    not a string, shorter than 3 characters, or the service returns
    nothing.  Side effect: archives the raw result set into the MySQL
    table ``synonym_leipzig`` via the module-level cursorMysql handle.
    """
    #print ("Auto Syn - Leipzig: ", libleipzig.Thesaurus("Auto",10))
    retContent = []
    # serialised form "W:<word>;S:<syn>;S:<syn>..." stored for later reuse
    retSaveMysql = "W:"+sl
    if not sl:
        return retContent
    elif not isinstance(sl, basestring):
        return retContent
    elif len(sl) < 3:
        return retContent
    # NOTE(review): remote web-service call — may be slow or unavailable
    synLeipzig = libleipzig.Thesaurus(sl, 150)
    if not synLeipzig:
        return retContent
    else:
        for aSyn in synLeipzig:
            # aSyn[0] presumably holds the synonym word — confirm against
            # the libleipzig Thesaurus result schema
            retContent.append(str(aSyn[0]))
            retSaveMysql += ";S:"+(str(aSyn[0]))
    # "W:" prefix plus a >=3-char word makes this true whenever any
    # synonym was appended
    if len(retSaveMysql) > 5:
        raw = json.dumps(retSaveMysql)
        loggit = "INSERT INTO synonym_leipzig(raw,uid) VALUES(%s, %s)"
        try:
            cursorMysql.execute(loggit, (raw, 0))
            mysql.commit()
        except MySQLdb.ProgrammingError:
            print("Function -getSynLeipzig()- failed: The following mysql query failed:")
            print(loggit)
            data = []  # NOTE(review): dead assignment, never read
    return retContent
# sent_tokenize_list = sent_tokenize(text)
# Summarize the text first and then work on it
tSumy = summarizeText(text)
tokens = nltk.word_tokenize(tSumy)    # tokens of the summarized text
tokensRaw = nltk.word_tokenize(text)  # tokens of the full original text
count = -1
# Leistungsschutzrecht (German ancillary copyright): up to 7 words may be
# reused verbatim, beyond that the text must be changed — so substitute
# at least every 8th word.
changeEveryWord = 8
changeEveryWordFlag = 0
changeEveryWordTemp = 0  # temporary upcount within the current window
# Main rewrite loop: walk the summarized token stream and replace every
# eligible word (>= 4 chars, not a known first name, outside the skip
# window) with its best-ranked synonym.
for word in tokens:
    count += 1
    # ASCII-fold the token for the name lookup (drops umlauts entirely)
    wordTemp = word.encode('ascii', 'ignore')
    # cursorMysql.execute("SELECT * FROM (namen_table) WHERE name LIKE '%s%%' LIMIT 1;" % (word))
    # NOTE(review): the token is interpolated directly into the SQL
    # string — injection-prone; prefer a parameterized query:
    # cursorMysql.execute("... WHERE BINARY `name` = %s LIMIT 1", (wordTemp,))
    cursorMysql.execute("SELECT * FROM (namen_table) WHERE BINARY `name` = '%s' LIMIT 1;" % (wordTemp))
    name_content = cursorMysql.fetchone()
    #print ("SELECT * FROM (namen_table) WHERE name LIKE '%s' LIMIT 1;" % (word))
    #print (name_content)
    # (dead experiment: sphinx-based name lookup)
    # search_query = Search(indexes=['onetipp_name'], config=SphinxitConfig)
    # search_query = search_query.match(word).options(
    #     ranker='proximity_bm25',
    #     max_matches=1,
    #     max_query_time=350,
    #     field_weights={'name': 100, 'gender': -10000, 'language': -10000, 'meaning': -10000},
    # )
    ###sphinx_result = search_query.ask()
    # exit(0)
    # a first name was found -> keep the word, do not substitute a synonym
    if name_content is not None:
        tokens[count] = deumlaut(word)
        tokensRaw[count] = deumlaut(word)
        continue
    else:
        1  # no-op placeholder (original style)
    # reset the skip window once changeEveryWord words have passed
    if changeEveryWordTemp == (changeEveryWord - 1):
        changeEveryWordFlag = 0
        changeEveryWordTemp = 0
    else:
        1
    # still inside the skip window: just count this word
    if changeEveryWordFlag == 1:
        changeEveryWordTemp += 1
    else:
        1
    if len(word) >=4 and changeEveryWordFlag == 0:
        # try the Leipzig thesaurus service first
        lstcWord = word[0:1]  # NOTE(review): unused local
        synDictLeipzig = {}
        sLeipzigList = getSynLeipzig(word)
        if sLeipzigList:
            # rank each candidate; skip synonyms already used elsewhere
            for wSynL in sLeipzigList:
                #synDict[SynRanker(wSyn, word)] = wSyn
                if wSynL not in noDoubleHash:
                    synDictLeipzig[wSynL] = SynRanker(wSynL, word)
            # best-ranked candidate first
            # NOTE(review): if every candidate was filtered out above,
            # sortedSynList is empty and [0] raises IndexError (uncaught here)
            sortedSynList = []
            sortedSynList = sorted(synDictLeipzig.items(), key=lambda x: x[1], reverse=True)
            firstBestSynHit = str(sortedSynList[0][0])
            firstBestSynHitRank = str(sortedSynList[0][1])
            # if the previous token ends a sentence, capitalize the synonym
            # NOTE(review): str.title() returns a new string; the result is
            # discarded, so this capitalization never takes effect
            if re.search(re_match, tokens[count-1]) is not None:
                firstBestSynHit.title()
            # carry the replaced word's trailing punctuation over
            if word.endswith('.'):
                firstBestSynHit += '.'
            elif word.endswith('?'):
                firstBestSynHit += '?'
            elif word.endswith('!'):
                firstBestSynHit += '!'
            elif word.endswith(','):
                firstBestSynHit += ','
            elif word.endswith(';'):
                firstBestSynHit += ';'
            elif word.endswith(':'):
                firstBestSynHit += ':'
            # later: randomly choose one of the synonyms sharing the top rating
            # tokens[count] = '' + deumlaut(firstBestSynHit) + ''
            tokens[count] = deumlaut(firstBestSynHit)
            noDoubleHash.add(firstBestSynHit)
            tokensRaw[count] = deumlaut(firstBestSynHit)
            changeEveryWordFlag = 1
            changeEveryWordTemp += 1
        else:
            # fall back to our local synonym MySQL database via sphinx
            search_query_syn = Search(indexes=['onetipp_syn_simple'], config=SphinxitConfig)
            search_query_syn = search_query_syn.match(word).options(
                ranker='proximity_bm25',
                max_matches=1,
                max_query_time=350,
                field_weights={'synonyms': 100},
            )
            sphinx_result_syn = search_query_syn.ask()
            synID = 0
            try:
                # first matched document id; IndexError when no match
                synID = sphinx_result_syn['result']['items'][0].values()[0]
                if synID > 0:
                    # later: find more synonyms via sphinx and parse them all
                    sql = "SELECT synonyms FROM (synonym_unique_simple) WHERE uid= %s" % (synID)
                    cursorMysql.execute(sql)
                    syn_content = cursorMysql.fetchone()
                    synContent = list(syn_content)
                    synContent = synContent[0].decode(encoding="utf-8", errors="ignore")
                    if syn_content:
                        # stored as a ";"-separated synonym list
                        synwords = synContent.split(";")
                        # http://www.saltycrane.com/blog/2007/09/how-to-sort-python-dictionary-by-keys/
                        synDict = {}
                        for wSyn in synwords:
                            #synDict[SynRanker(wSyn, word)] = wSyn
                            if wSyn not in noDoubleHash:
                                synDict[wSyn] = SynRanker(wSyn, word)
                        sortedSynList = []
                        sortedSynList = sorted(synDict.items(), key=lambda x: x[1], reverse=True)
                        firstBestSynHit = str(sortedSynList[0][0])
                        firstBestSynHitRank = str(sortedSynList[0][1])
                        # if the previous token ends a sentence, capitalize
                        # NOTE(review): result of .title() discarded here too
                        if re.search(re_match, tokens[count-1]) is not None:
                            firstBestSynHit.title()
                        if word.endswith('.'):
                            firstBestSynHit += '.'
                        elif word.endswith('?'):
                            firstBestSynHit += '?'
                        elif word.endswith('!'):
                            firstBestSynHit += '!'
                        elif word.endswith(','):
                            firstBestSynHit += ','
                        elif word.endswith(';'):
                            firstBestSynHit += ';'
                        elif word.endswith(':'):
                            firstBestSynHit += ':'
                        # later: randomly choose among equally-rated synonyms
                        # tokens[count] = '' + deumlaut(firstBestSynHit) + ''
                        tokens[count] = deumlaut(firstBestSynHit)
                        noDoubleHash.add(firstBestSynHit)
                        tokensRaw[count] = deumlaut(firstBestSynHit)
                        changeEveryWordFlag = 1
                        changeEveryWordTemp += 1
                        #break
            except IndexError:
                # no sphinx match / empty candidate list: leave word unchanged
                1
# write the rewritten text to the output file
outputtext = ' '.join(tokens)
outputtextRaw = ' '.join(tokensRaw)
# Flesch reading-ease score of the raw token stream (score table at end of file)
readabilityVar = str(textstat.flesch_reading_ease(outputtextRaw))
# NOTE(review): codecs.open without an explicit encoding — relies on the
# process default set via sys.setdefaultencoding above
with codecs.open(outputfile, 'w') as f:
    f.write(outputtext )
    # f.write("Lesbarkeitswert : " + readabilityVar)
    #f.write(outputtext)
    #f.write("RUSSISCHE TRANSLITERATION: BEISPIEL VERSION")
    #f.write(translit(outputtextRaw, 'ru'))
f.close()  # NOTE(review): redundant — the with-statement already closed f
mysql.commit()
mysql.close()
exit(0)
"""
The Flesch Reading Ease formula
function name - flesch_reading_ease(text)
returns the Flesch Reading Ease Score. Following table is helpful to access the ease of readability in a document.
90-100 : Very Easy
80-89 : Easy
70-79 : Fairly Easy
60-69 : Standard
50-59 : Fairly Difficult
30-49 : Difficult
0-29 : Very Confusing
"""