#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
###########################
### Autor: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.8a - 22-10-2015@23:53 Uhr
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python: Main File
###########################
"""
Synonym bzw Wortersetzung parallelisieren für schnellere Verarbeitung und Reaktionszeit des Tools
Antonym Datenbank Entwicklung mit Hilfe gecrawlter Websites
Aufbau einer Datenbank mit einfacher deutscher Sprache
Berechnung des lesbarkeitswerts eines eingabetextes - basierend auf einfachen Texten die "simple German " Datenbank für Austausch nutzen, Wissenschaftliche Texte mit Leipzig und unserer lokalen Synonym Datenbank austauschen
"""
import pprint
#https://docs.python.org/2/library/configparser.html
import os
import sys
reload(sys)
sys.path.append('/home/onetipp/python/modules')
os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'
import random
import sphinxapi
import codecs
import re
from transliterate import translit, get_available_language_codes
import onetipp
#client = sphinxapi.SphinxClient()
#client.SetServer('127.0.0.1', 9312)
cursorMysql = onetipp.mysql.cursor()
# Synonyms that have already been inserted once; they are never reused.
noDoubleHash = set()
# Matches sentence-ending punctuation; used to detect sentence starts.
re_match = r"(\?|\.|\!)"  # Match: ". WORT"
# Input and output files from the command line.
# http://www.tutorialspoint.com/python/python_command_line_arguments.htm
inputfile = sys.argv[1]
outputfile = sys.argv[2]
# Read the whole input file as UTF-8.
# BUGFIX: use a context manager so the file handle is closed again
# (the previous open(...).read() leaked the handle).
with codecs.open(inputfile, "r", encoding='utf-8') as inputhandle:
    text = inputhandle.read()
# Summarize the text first and then work on the summary.
tSumy = onetipp.summarizeText(text)
tokens = onetipp.nltk.word_tokenize(tSumy)
tokensRaw = onetipp.nltk.word_tokenize(text)
count = -1
# Leistungsschutzrecht (German ancillary copyright): up to 7 characters may
# be used verbatim, beyond that the text must be changed -> substitute at
# least every 6th word.
changeEveryWord = 6
changeEveryWordFlag = 0
changeEveryWordTemp = 0  # temporary upcount
ignoreNextWord = 0
# Main substitution loop: walk over the summarized tokens and replace every
# n-th eligible word by its best-ranked synonym (Leipzig DB first, local
# MySQL/Sphinx synonym DB as fallback).
for word in tokens:
    count += 1
    word = onetipp.to_unicode(word)
    # Skip weekday names entirely.
    if word in onetipp.Wochentage:
        continue
    # Only words with four or more characters are considered.
    elif len(word) < 4:
        continue
    # --- Name detection ---------------------------------------------------
    # Capitalized words are checked against the local name table; a known
    # name is kept (only de-umlauted), no synonym is substituted.
    if word[0].isupper():
        try:
            # NOTE(review): word is interpolated straight into the SQL
            # string — this is injectable; a parameterized query
            # (cursor.execute(sql, (word,))) would be safer.
            cursorMysql.execute("SELECT uid FROM (namen_table) WHERE BINARY `name` = '%s' LIMIT 1;" % (word))
            name_content = cursorMysql.fetchone()
            # A name was found -> do not exchange it for a synonym.
            if name_content is not None:
                tokens[count] = '' + onetipp.deumlaut(word) + ''
                tokensRaw[count] = onetipp.deumlaut(word)
                continue
        except Exception:
            print("Der Namensparser der lokalen SQL Datenbank konnte nicht angesprochen werden")
            name_content = None
    # --- Substitution throttle --------------------------------------------
    # Reset the flag after (changeEveryWord - 1) skipped words so that the
    # next eligible word gets substituted again.
    if changeEveryWordTemp == (changeEveryWord - 1):
        changeEveryWordFlag = 0
        changeEveryWordTemp = 0
    if changeEveryWordFlag == 1:
        changeEveryWordTemp += 1
    if changeEveryWordFlag == 0:
        # Try the Leipzig synonym database first.
        synDictLeipzig = {}
        sLeipzigList = onetipp.getSynLeipzig(word)
        if sLeipzigList:
            for wSynL in sLeipzigList:
                # Never offer a synonym that was already used once.
                if wSynL not in noDoubleHash:
                    synDictLeipzig[wSynL] = onetipp.SynRanker(wSynL, word)
            sortedSynList = sorted(synDictLeipzig.items(), key=lambda x: x[1], reverse=True)
            # BUGFIX: guard against an empty candidate list (all synonyms
            # already used) — random.randint(0, -1) raised ValueError and
            # crashed the script in this branch.
            if sortedSynList:
                randElement = random.randint(0, len(sortedSynList) - 1)
                firstBestSynHit = onetipp.to_unicode(sortedSynList[randElement][0])
                # BUGFIX: keep the rank numeric. It was converted with
                # to_unicode() before, and a unicode string compares as
                # "never < 0" in Python 2, so the fallback below never fired.
                firstBestSynHitRank = sortedSynList[randElement][1]
                tmpRank = 0
                UseSyn = True
                # Negative rank for the random pick: fall back to the
                # top-ranked candidate; if even that is negative, keep word.
                if firstBestSynHitRank < 0:
                    firstBestSynHit = onetipp.to_unicode(sortedSynList[0][0])
                    tmpRank = sortedSynList[0][1]
                    if tmpRank < 0:
                        UseSyn = False
                # If the previous token ended a sentence, capitalize the
                # substitute. BUGFIX: str.title() returns a new string; its
                # result was discarded here before (the local-DB branch
                # below already assigned it correctly).
                if re.search(re_match, tokens[count - 1]) is not None:
                    firstBestSynHit = firstBestSynHit.title()
                firstBestSynHit = onetipp.putPunctuation(word, firstBestSynHit)
                if UseSyn is False:
                    tokens[count] = word
                    tokensRaw[count] = word
                elif UseSyn is True:
                    tokens[count] = '' + onetipp.deumlaut(firstBestSynHit) + ''
                    tokensRaw[count] = onetipp.deumlaut(firstBestSynHit)
                    # Never use the same synonym twice in one document.
                    noDoubleHash.add(firstBestSynHit)
                    changeEveryWordFlag = 1
                    changeEveryWordTemp += 1
        else:
            # Fall back to the local MySQL synonym database via Sphinx.
            search_query_syn = onetipp.Search(indexes=['onetipp_syn_simple'], config=onetipp.SphinxitConfig)
            search_query_syn = search_query_syn.match(word, raw=True).options(
                ranker='proximity_bm25',
                max_matches=1,
                max_query_time=50,
                field_weights={'synonyms': 100},
            )
            sphinx_result_syn = search_query_syn.ask()
            synID = 0
            if sphinx_result_syn is not None and len(sphinx_result_syn['result']['items']) >= 1:
                synID = sphinx_result_syn['result']['items'][0].values()[0]
            try:
                # TODO(review): find more synonym rows via Sphinx later and
                # parse all of them, not just one.
                sql = "SELECT synonyms FROM (synonym_unique_simple) WHERE uid= %s" % (synID)
                cursorMysql.execute(sql)
                syn_content = cursorMysql.fetchone()
                # NOTE(review): list(None) raises TypeError when no row
                # matched; the except below swallows it and the word is
                # simply kept unchanged — preserved best-effort behavior.
                synContent = list(syn_content)
                synContent = onetipp.to_unicode(synContent[0])
                synDict = {}
                if syn_content is not None:
                    # Synonyms are stored as a ";"-separated list.
                    synwords = synContent.split(";")
                    for wSyn in synwords:
                        if wSyn not in noDoubleHash:
                            synDict[wSyn] = onetipp.SynRanker(wSyn, word)
                sortedSynList = sorted(synDict.items(), key=lambda x: x[1], reverse=True)
                if sortedSynList:
                    randElement = random.randint(0, len(sortedSynList) - 1)
                    firstBestSynHit = onetipp.to_unicode(sortedSynList[randElement][0])
                    # BUGFIX: rank kept numeric (see Leipzig branch above).
                    firstBestSynHitRank = sortedSynList[randElement][1]
                    tmpRank = 0
                    UseSyn = True
                    if firstBestSynHitRank < 0:
                        firstBestSynHit = onetipp.to_unicode(sortedSynList[0][0])
                        tmpRank = sortedSynList[0][1]
                        if tmpRank < 0:
                            UseSyn = False
                    # Capitalize right after sentence-ending punctuation.
                    if re.search(re_match, tokens[count - 1]) is not None:
                        firstBestSynHit = firstBestSynHit.title()
                    firstBestSynHit = onetipp.putPunctuation(word, firstBestSynHit)
                    if UseSyn is False:
                        tokens[count] = word
                        tokensRaw[count] = word
                    elif UseSyn is True:
                        tokens[count] = '' + onetipp.deumlaut(firstBestSynHit) + ''
                        tokensRaw[count] = onetipp.deumlaut(firstBestSynHit)
                        # Never use the same synonym twice in one document.
                        noDoubleHash.add(firstBestSynHit)
                        changeEveryWordFlag = 1
                        changeEveryWordTemp += 1
            except Exception:
                # Best effort: on any DB/parse failure keep the original word.
                pass
# --- Write the result files ------------------------------------------------
outputtext = ' '.join(tokens)
outputtextRaw = ' '.join(tokensRaw)
# Flesch reading-ease score of the ORIGINAL input text; currently computed
# but not written out (kept for the planned readability report).
readabilityVar = str(onetipp.textstat.flesch_reading_ease(text))
# BUGFIX: removed the redundant f.close() calls — the `with` blocks already
# close the files — and dropped the garbled commented-out writes.
with codecs.open(outputfile, 'w', encoding='utf-8') as f:
    f.write(outputtext)
with codecs.open(outputfile + ".raw.txt", 'w', encoding='utf-8') as f:
    f.write(outputtextRaw)
onetipp.mysql.commit()
onetipp.mysql.close()
exit(0)
"""
The Flesch Reading Ease formula
function name - flesch_reading_ease(text)
returns the Flesch Reading Ease Score. Following table is helpful to access the ease of readability in a document.
90-100 : Very Easy
80-89 : Easy
70-79 : Fairly Easy
60-69 : Standard
50-59 : Fairly Difficult
30-49 : Difficult
0-29 : Very Confusing
"""