#!/usr/bin/python
# -*- coding: latin1 -*-
from __future__ import division
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
###########################
### Autor: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.4a - 14-10-2015@22:53 Uhr
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python
###########################
######## export PYTHON_EGG_CACHE=/tmp
import pprint
import os
import nltk
# import rocksdb # the shared library cannot be loaded at the moment
import MySQLdb # apt-get install python-mysqldb
from sphinxit.core.processor import Search # http://sphinxit.readthedocs.org/en/latest/
from sphinxit.core.helpers import BaseSearchConfig
from random import randint
import codecs
import sys
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
# import smrzr # https://github.com/lekhakpadmanabh/Summarizer
import re
# Point setuptools' egg extraction cache at a fixed directory for this process.
# NOTE(review): the header comment above suggests /tmp instead - confirm which
# directory is intended.
os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'
# One-time NLTK data installation commands, kept for reference:
###python -m nltk.downloader -d /usr/share/nltk_data all
####python -m nltk.downloader all
###########nltk.download()
# nltk.download("punkt")
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default encoding can be switched to latin-1. This changes
# how implicit str <-> unicode conversions behave everywhere in the process.
reload(sys)
sys.setdefaultencoding('latin-1')
class SphinxitConfig(BaseSearchConfig):
    """Sphinxit settings used for every ``Search`` query in this script.

    Overrides selected attributes of sphinxit's ``BaseSearchConfig``. The
    class itself (not an instance) is handed to ``Search(..., config=...)``.
    """
    # Debug/metadata switches are all disabled.
    # NOTE(review): exact semantics are defined by sphinxit - see its docs.
    DEBUG = False
    WITH_META = False
    WITH_STATUS = False
    # presumably the size of sphinxit's connection pool - TODO confirm
    POOL_SIZE = 5
    # SQL_ENGINE = 'oursql'
    # Address of the searchd (SphinxQL) listener on the local machine.
    SEARCHD_CONNECTION = {
        'host': '127.0.0.1',
        'port': 9977,
    }
# delimiters = ['\n', ' ', ',', '.', '?', '!', ':', ';', '\s', '\t', '\r']
# http://pyrocksdb.readthedocs.org/en/v0.4/tutorial/index.html
# https://github.com/sphinxsearch/sphinx/blob/master/api/sphinxapi.py
# http://www.tutorialspoint.com/python/python_database_access.htm
# mysql = MySQLdb.connect("localhost","root","###########99","onetipp" ) # last working
# Both connections use the same host, credentials and schema; only the port
# differs (9977 = SphinxQL listener, 3306 = MySQL server).
_DB_SETTINGS = dict(
    host='127.0.0.1',
    user='root',
    passwd='###########99',
    db='onetipp',
)

# SphinxQL connection and cursor.
sphinx = MySQLdb.connect(port=9977, **_DB_SETTINGS)
cursorSphinx = sphinx.cursor()

# MySQL connection and cursor.
mysql = MySQLdb.connect(port=3306, **_DB_SETTINGS)
cursorMysql = mysql.cursor()
def deumlaut(s):
    """
    Replace German umlauts and the sharp s with ASCII transliterations.

    The characters are matched by their latin-1 code points:
    sharp s -> "ss", u-umlaut -> "ue"/"Ue", o-umlaut -> "oe"/"Oe",
    a-umlaut -> "ae"/"Ae". All other characters are left untouched.

    :param s: text to transliterate
    :return: the transliterated text
    """
    replacements = (
        ('\xdf', 'ss'),
        ('\xfc', 'ue'),
        ('\xdc', 'Ue'),
        ('\xf6', 'oe'),
        ('\xd6', 'Oe'),
        ('\xe4', 'ae'),
        ('\xc4', 'Ae'),
    )
    for umlaut, transliteration in replacements:
        s = s.replace(umlaut, transliteration)
    return s
def summarizeText(s):
    """
    Summarize German text with sumy's LSA summarizer.

    The number of sentences kept is chosen at random between
    ``sentenceCount - 5`` and ``sentenceCount``, but never fewer than one,
    so short texts no longer produce a zero or negative sentence count.

    :param s: the text to summarize
    :return: the selected sentences, each followed by a single space;
             an empty string when *s* contains no text
    """
    ## sumy: https://github.com/miso-belica/sumy/tree/dev/sumy/summarizers
    # Nothing to summarize: skip tokenizing and summarizing entirely.
    if not s or not s.strip():
        return ""
    sentenceCount = len(nltk.sent_tokenize(s))
    if sentenceCount == 0:
        return ""
    # Clamp the lower bound to 1; the original passed sentenceCount - 5
    # straight to randint, which goes negative for texts under six sentences.
    randSentenceCount = randint(max(1, sentenceCount - 5), sentenceCount)
    parser = PlaintextParser.from_string(s, Tokenizer("german"))
    summarizer = Summarizer(Stemmer("german"))
    summary = summarizer(parser.document, randSentenceCount)
    # Keep the trailing space after every sentence, as before.
    return "".join(str(sentence) + " " for sentence in summary)
# TODO:
# - create a German stopword list
# - if a synonym contains a stopword, apply a rank penalty
def SynRanker(s,t):
    """
    Score how suitable synonym *s* is as a replacement for the word *t*.

    Scoring starts at 1.0 and is adjusted as follows:

    * shorter than 3 characters: -0.65
    * contains a space: -0.75 if 14 or more characters, else -0.55
    * no space: -0.05 if 14 or more characters, else +0.05
    * contains punctuation (? . , ; : !) or a digit: -0.12

    A debug report of the individual features is printed to stdout.

    :param s: the synonym candidate
    :param t: the original word
    :return: -1 if *s* equals *t*, -10 if *s* is empty, otherwise the
             score as a float
    """
    lenSyn = len(s)
    synHasDigits = any(ch.isdigit() for ch in s)

    # Sentinel results: identical to the original word, or empty candidate.
    if s == t:
        return -1
    if lenSyn <= 0:
        return -10

    startVal = 1.0
    if lenSyn < 3:
        startVal -= 0.65

    # Space/length combinations. These four cases are exhaustive, which is
    # why the former whitespace-regex fallback could never run and was dropped.
    synHasSpace = ' ' in s
    synIsLong = lenSyn >= 14
    if synHasSpace and synIsLong:
        startVal -= 0.75
    elif synHasSpace:
        startVal -= 0.55
    elif synIsLong:
        startVal -= 0.05
    else:
        startVal += 0.05

    # "Sonderzeichen" (special characters): punctuation or any digit.
    synhasSonder = re.search(r"(\?|\.|\,|\;|\:|\!|\d)", s) is not None
    if synhasSonder:
        startVal -= 0.12

    report = (
        ("Synonym: ", s),
        ("Length: ", lenSyn),
        ("Digits: ", synHasDigits),
        ("Space: ", synHasSpace),
        ("Sonderzeichen: ", synhasSonder),
        ("SynRank: ", startVal),
    )
    for label, value in report:
        print(label, value)
        print("\n")
    print("---------------------------------------------------\n")
    # later ResultCodes
    return float(startVal)
def SynDictCalculator(s, t=None):
    """
    Build a mapping from SynRanker score to synonym.

    The original body called SynRanker with a single argument, which raises
    TypeError because SynRanker requires the original word as well. The word
    is now passed through via the optional *t* parameter.

    :param s: iterable of synonym candidates
    :param t: the original word the candidates are ranked against;
              with the default None no candidate is treated as identical
    :return: dict mapping score -> synonym. Candidates with the same score
             overwrite each other, so only the last one per score is kept.
    """
    synDict = {}
    for cSyn in s:
        synDict[SynRanker(cSyn, t)] = cSyn
    return synDict
def iround(x):
    """iround(number) -> integer
    Round a number to the nearest integer, with ties rounded away from zero.

    The previous formula, int(round(x) - .5) + (x > 0), returned 1 for any
    0 < x < 0.5 and depended on the interpreter's round() tie rule.
    """
    # int() truncates toward zero, so shift by one half in the direction of
    # the sign before truncating.
    if x >= 0:
        return int(x + 0.5)
    return -int(0.5 - x)
def _fetchSynonyms(word):
    """Return the synonym candidates stored for *word*, or [] if there are none.

    Asks the Sphinx index 'onetipp_syn_simple' for the best match, then loads
    the ';'-separated synonym list for that id from MySQL.
    """
    search_query_syn = Search(indexes=['onetipp_syn_simple'], config=SphinxitConfig)
    search_query_syn = search_query_syn.match(word).options(
        ranker='proximity_bm25',
        max_matches=1,
        max_query_time=350,
        field_weights={'synonyms': 100},
    )
    sphinx_result_syn = search_query_syn.ask()
    # http://stackoverflow.com/questions/7971618/python-return-first-n-keyvalue-pairs-from-dict
    try:
        # An empty 'items' list means Sphinx found no match for this word.
        synID = list(sphinx_result_syn['result']['items'][0].values())[0]
    except IndexError:
        return []
    if not synID > 0:
        return []
    # Parameterized query: the driver escapes the value.
    cursorMysql.execute(
        "SELECT synonyms FROM (synonym_unique_simple) WHERE uid= %s", (synID,))
    syn_content = cursorMysql.fetchone()
    if not syn_content:
        return []
    return syn_content[0].split(";")


def _chooseSynonym(word, synwords):
    """Return the first usable synonym for *word*, or None.

    A synonym is usable when it is shorter than 25 characters and its first
    letter has the same case (upper/lower) as the first letter of *word*.
    The returned synonym has its umlauts transliterated.
    """
    lstcWord = word[0:1]
    for cSyn in synwords:
        if len(cSyn) >= 25:
            continue
        lstcSyn = cSyn[0:1]
        bothUpper = lstcSyn.isupper() and lstcWord.isupper()
        bothLower = lstcSyn.islower() and lstcWord.islower()
        if bothUpper or bothLower:
            return deumlaut(cSyn)
    return None


inputfile = sys.argv[1]
outputfile = sys.argv[2]
# http://www.tutorialspoint.com/python/python_command_line_arguments.htm
# Read the whole input file, decoded as latin-1. The original read raw bytes
# and threw away the result of text.decode('latin-1'); the handle was never
# closed either.
with codecs.open(inputfile, 'r', encoding='latin-1') as inputHandle:
    text = inputHandle.read()

# Summarize the text first and then work on it
tSumy = summarizeText(text)
tokens = nltk.word_tokenize(tSumy)

for count, word in enumerate(tokens):
    # Only words of five or more characters are considered for replacement.
    if len(word) < 5:
        continue

    # 1. Check the names table: a hit write-protects the word.
    #    The Sphinx name query that used to be built here was never executed
    #    (its .ask() call was commented out), so only MySQL is consulted.
    #    The word comes from the input text, so it is passed as a bound
    #    parameter instead of being interpolated into the SQL string.
    cursorMysql.execute(
        "SELECT * FROM (namen_table) WHERE name LIKE %s LIMIT 1;", (word,))
    name_content = cursorMysql.fetchone()

    if name_content is not None:
        # A name was found -> do not swap in a synonym; only transliterate
        # umlauts of capitalized words.
        # NOTE(review): the source indentation was lost; this branch is read
        # as the else of the name check - confirm.
        if word[0:1].isupper():
            tokens[count] = deumlaut(word)
        continue

    # 2. No name was found -> try to replace the word with a synonym.
    synwords = _fetchSynonyms(word)
    if not synwords:
        continue

    # Rank every candidate (SynRanker prints a debug report for each one).
    # The ranking is not used for the selection yet.
    # later: Randomly choose one of the synonyms that have all the highest rating
    synDict = dict((wSyn, SynRanker(wSyn, word)) for wSyn in synwords)
    sortedSynList = sorted(synDict.items(), key=lambda x: x[1], reverse=True)

    replacement = _chooseSynonym(word, synwords)
    if replacement is not None:
        tokens[count] = replacement

# Write the result file. latin-1 matches the process default encoding that
# the implicit conversion used before.
outputtext = ' '.join(tokens)
with codecs.open(outputfile, 'w', encoding='latin-1') as f:
    f.write(outputtext)

mysql.close()
sphinx.close()
# print outputtext
sys.exit(0)