#!/usr/bin/python # -*- coding: ISO-8859-1 -*- from __future__ import division from __future__ import absolute_import from __future__ import division, print_function, unicode_literals ########################### ### Autor: Sebastian Enger / M.Sc. ### Copyright: Sebastian Enger ### Licence: Commercial / OneTipp ### Version: 1.0.8a - 22-10-2015@23:53 Uhr ### Contact: sebastian.enger@gmail.com ### OneTipp Text Tool in Python: Main File ########################### """ Synonym bzw Wortersetzung parallelisieren für schnellere Verarbeitung und Reaktionszeit des Tools Antonym Datenbank Entwicklung mit Hilfe gecrawlter Websites Aufbau einer Datenbank mit einfacher deutscher Sprache Berechnung des lesbarkeitswerts eines eingabetextes - basierend auf einfachen Texten die "simple German " Datenbank für Austausch nutzen, Wissenschaftliche Texte mit Leipzig und unserer lokalen Synonym Datenbank austauschen """ import pprint #https://docs.python.org/2/library/configparser.html import os import sys reload(sys) sys.path.append('/home/onetipp/python/modules') os.environ['PYTHON_EGG_CACHE'] = '/home/compress/' import random import sphinxapi import codecs import re from transliterate import translit, get_available_language_codes import onetipp #client = sphinxapi.SphinxClient() #client.SetServer('127.0.0.1', 9312) cursorMysql = onetipp.mysql.cursor() noDoubleHash = set() ###re_match = r"[(\?|\.|\!)][(\t|\r|\n|\s|\w){0,}]([A-Za-z0-9]{1,})" # Match: ". WORT" re_match = r"(\?|\.|\!)" # Match: ". 
# Read the input and output file paths from the command line.
# http://www.tutorialspoint.com/python/python_command_line_arguments.htm
inputfile = sys.argv[1]
outputfile = sys.argv[2]

# Read the whole input file as UTF-8 text.
text = codecs.open(inputfile, "r", encoding='utf-8').read()

# Summarize the text first and then work on the summary.
tSumy = onetipp.summarizeText(text)
tokens = onetipp.nltk.word_tokenize(tSumy)
tokensRaw = onetipp.nltk.word_tokenize(text)


def _pick_synonym(candidates, word):
    """Pick one synonym from a {candidate: rank} dict.

    Returns (synonym, use_flag); (None, False) when there are no candidates
    (guard: every candidate may already be in noDoubleHash — the original
    crashed on random.randint(0, -1) in that case).

    Selection rule kept from the original: draw a random candidate; if its
    rank is negative fall back to the top-ranked one; if that is negative
    too, the word is left unchanged (use_flag False).
    """
    ranked = sorted(candidates.items(), key=lambda x: x[1], reverse=True)
    if not ranked:
        return None, False
    pick = random.randint(0, len(ranked) - 1)
    best, rank = ranked[pick]
    # NOTE(review): ranks are compared numerically here.  The original
    # passed them through onetipp.to_unicode() first, which in Python 2
    # makes `rank < 0` always False (numbers sort before strings) — the
    # negative-rank fallback could never trigger.
    if rank < 0:
        best, rank = ranked[0]
        if rank < 0:
            return onetipp.to_unicode(best), False
    return onetipp.to_unicode(best), True


def _apply_synonym(idx, word, synonym, use_syn):
    """Write the replacement (or the unchanged word) into both token lists.

    Returns True when a synonym was actually substituted, so the caller can
    advance the replacement window.
    """
    # If the previous token ended a sentence, capitalise the replacement.
    # Fix: the original Leipzig branch called .title() without assigning
    # the result (strings are immutable), so the capitalisation was lost.
    if re.search(re_match, tokens[idx - 1]) is not None:
        synonym = synonym.title()
    synonym = onetipp.putPunctuation(word, synonym)
    if not use_syn:
        tokens[idx] = word
        tokensRaw[idx] = word
        return False
    tokens[idx] = '' + onetipp.deumlaut(synonym) + ''
    tokensRaw[idx] = onetipp.deumlaut(synonym)
    # Never reuse a synonym within the same document.
    noDoubleHash.add(synonym)
    return True


count = -1
# Leistungsschutzrecht: up to 7 words may be reused verbatim, beyond that
# the text must be changed — hence a replacement window of 6.
changeEveryWord = 6
changeEveryWordFlag = 0
changeEveryWordTemp = 0  # temporary upcount inside the window
ignoreNextWord = 0

for word in tokens:
    count += 1
    word = onetipp.to_unicode(word)

    # Skip weekday names.
    if word in onetipp.Wochentage:
        continue
    # Only consider words with at least four characters.
    elif len(word) < 4:
        continue

    # ---- name detection --------------------------------------------------
    # Only capitalised words are checked against the names table.
    # (Known bug noted in the original: "der nächsten" style phrases.)
    if word[0].isupper():
        try:
            # Parameterized query — `word` comes straight from untrusted
            # input text; the original interpolated it with %, which is an
            # SQL-injection vector.
            cursorMysql.execute(
                "SELECT uid FROM (namen_table) WHERE BINARY `name` = %s LIMIT 1;",
                (word,))
            name_content = cursorMysql.fetchone()
            if name_content is not None:
                # The word is a name -> keep it, never replace names.
                tokens[count] = '' + onetipp.deumlaut(word) + ''
                tokensRaw[count] = onetipp.deumlaut(word)
                continue
        except Exception:
            print("Der Namensparser der lokalen SQL Datenbank konnte nicht angesprochen werden")
            name_content = None

    # ---- replacement window ----------------------------------------------
    # After changeEveryWord-1 skipped words, allow the next replacement.
    if changeEveryWordTemp == (changeEveryWord - 1):
        changeEveryWordFlag = 0
        changeEveryWordTemp = 0

    if changeEveryWordFlag == 1:
        changeEveryWordTemp += 1

    if changeEveryWordFlag == 0:
        # Try the Leipzig synonym database first.
        synDictLeipzig = {}
        sLeipzigList = onetipp.getSynLeipzig(word)
        if sLeipzigList:
            for wSynL in sLeipzigList:
                if wSynL not in noDoubleHash:
                    synDictLeipzig[wSynL] = onetipp.SynRanker(wSynL, word)
            synonym, use_syn = _pick_synonym(synDictLeipzig, word)
            if synonym is not None and _apply_synonym(count, word, synonym, use_syn):
                changeEveryWordFlag = 1
                changeEveryWordTemp += 1
        else:
            # Fall back to our local synonym MySQL database via the Sphinx
            # full-text index.
            search_query_syn = onetipp.Search(indexes=['onetipp_syn_simple'],
                                             config=onetipp.SphinxitConfig)
            search_query_syn = search_query_syn.match(word, raw=True).options(
                ranker='proximity_bm25',
                max_matches=1,
                max_query_time=50,
                field_weights={'synonyms': 100},
            )
            sphinx_result_syn = search_query_syn.ask()
            if sphinx_result_syn is not None and len(sphinx_result_syn['result']['items']) >= 1:
                # Python 2: dict.values() returns a list, so [0] is valid.
                synID = sphinx_result_syn['result']['items'][0].values()[0]
                try:
                    # Parameterized (synID comes from Sphinx, but be consistent).
                    cursorMysql.execute(
                        "SELECT synonyms FROM (synonym_unique_simple) WHERE uid= %s",
                        (synID,))
                    syn_content = cursorMysql.fetchone()
                    if syn_content is not None:
                        synContent = onetipp.to_unicode(list(syn_content)[0])
                        # Synonym list is stored ';'-separated in one column.
                        synDict = {}
                        for wSyn in synContent.split(";"):
                            if wSyn not in noDoubleHash:
                                synDict[wSyn] = onetipp.SynRanker(wSyn, word)
                        synonym, use_syn = _pick_synonym(synDict, word)
                        if synonym is not None and _apply_synonym(count, word, synonym, use_syn):
                            changeEveryWordFlag = 1
                            changeEveryWordTemp += 1
                except Exception:
                    # Best effort: DB errors for a single word are ignored
                    # (the original swallowed them silently as well).
                    pass

# ---- write output ---------------------------------------------------------
outputtext = ' '.join(tokens)
outputtextRaw = ' '.join(tokensRaw)
readabilityVar = str(onetipp.textstat.flesch_reading_ease(text))

with codecs.open(outputfile, 'w', encoding='utf-8') as f:
    f.write(outputtext)

with codecs.open(outputfile + ".raw.txt", 'w', encoding='utf-8') as f:
    f.write(outputtextRaw)
    f.write("Lesbarkeitswert : " + readabilityVar)
    # f.write("\n\n")
    # f.write(outputtext)
    # f.write("\n\n")
    # f.write("RUSSISCHE TRANSLITERATION: BEISPIEL VERSION")
    # f.write("\n\n")
    # f.write(translit(outputtextRaw, 'ru'))

onetipp.mysql.commit()
onetipp.mysql.close()
exit(0)

"""
The Flesch Reading Ease formula
function name - flesch_reading_ease(text)
returns the Flesch Reading Ease Score.

Following table is helpful to access the ease of readability in a document.
90-100 : Very Easy
80-89  : Easy
70-79  : Fairly Easy
60-69  : Standard
50-59  : Fairly Difficult
30-49  : Difficult
0-29   : Very Confusing
"""