#!/usr/bin/python
# -*- coding: ISO-8859-1 -*-
# FIX: 'division' was imported twice via three separate __future__ lines;
# consolidated into a single import.
from __future__ import absolute_import, division, print_function, unicode_literals
###########################
### Autor: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.6 - 15-10-2015@23:53 Uhr
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python
###########################
######## export PYTHON_EGG_CACHE=/tmp

# --- standard library ---
import codecs
import json
import os
import pprint  # FIX: was imported twice
import re
import sys
from random import randint

# --- third party ---
import nltk
# import rocksdb  # shared library kann aktuell noch nicht gelesen werden
import MySQLdb  # apt-get install python-mysqldb
from sphinxit.core.processor import Search  # http://sphinxit.readthedocs.org/en/latest/
from sphinxit.core.helpers import BaseSearchConfig
from past.builtins import basestring  # pip install future
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from transliterate import translit, get_available_language_codes
import libleipzig

os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'

### NLTK data setup (run once):
###   python -m nltk.downloader -d /usr/share/nltk_data all
####  python -m nltk.downloader all
###########nltk.download()
# nltk.download("punkt")

# Python-2-only hack: force UTF-8 as the default codec for the implicit
# str <-> unicode conversions this script relies on throughout.
reload(sys)
sys.setdefaultencoding('utf-8')

# Synonyms that were already used once as a replacement; each synonym is
# only ever inserted into the output a single time.
noDoubleHash = set()

# lies die Ein- und Ausgabedateien:
# input and output file names come from the command line.
# http://www.tutorialspoint.com/python/python_command_line_arguments.htm
inputfile = sys.argv[1]
outputfile = sys.argv[2]

# Read the whole source text and decode it to unicode.
# FIX: use a context manager so the input file handle is closed
# deterministically (previously open(...).read() leaked the handle).
with open(inputfile, 'r') as _infile:
    text = _infile.read().decode("utf-8")


class SphinxitConfig(BaseSearchConfig):
    """Connection settings for the local SphinxQL search daemon."""
    DEBUG = False
    WITH_META = False
    WITH_STATUS = False
    POOL_SIZE = 5
    # SQL_ENGINE = 'oursql'
    SEARCHD_CONNECTION = {
        'host': '127.0.0.1',
        'port': 9977,
    }

# delimiters = ['\n', ' ', ',', '.', '?', '!', ':', ';', '\s', '\t', '\r']
#
# http://pyrocksdb.readthedocs.org/en/v0.4/tutorial/index.html
# https://github.com/sphinxsearch/sphinx/blob/master/api/sphinxapi.py
# http://www.tutorialspoint.com/python/python_database_access.htm

# mysql = MySQLdb.connect("localhost","root","###########99","onetipp" )  # last working

# SphinxQL daemon -- speaks the MySQL wire protocol on port 9977.
sphinx = MySQLdb.connect(
    host='127.0.0.1',
    user='root',
    passwd='###########99',
    db='onetipp',
    port=9977)  # sphinxQL
cursorSphinx = sphinx.cursor()

# Regular MySQL server holding the name and synonym tables.
mysql = MySQLdb.connect(
    host='127.0.0.1',
    user='root',
    passwd='###########99',
    db='onetipp',
    port=3306)  # Mysql
mysql.autocommit(True)
cursorMysql = mysql.cursor()


def log_warnings(curs):
    """Forward any MySQL warnings collected on *curs* to the logging module."""
    # FIX: 'logging' was referenced but never imported anywhere in this file,
    # so any call would have raised NameError.
    import logging
    for msg in curs.messages:
        if msg[0] == MySQLdb.Warning:
            logging.warn(msg[1])


def deumlaut(s):
    """Replace German umlauts and sharp-s in *s* with ASCII transcriptions."""
    s = s.replace('\xdf', 'ss')
    s = s.replace('\xfc', 'ue')
    s = s.replace('\xdc', 'Ue')
    s = s.replace('\xf6', 'oe')
    s = s.replace('\xd6', 'Oe')
    s = s.replace('\xe4', 'ae')
    s = s.replace('\xc4', 'Ae')
    return s


def summarizeText(s):
    """Summarize *s* with LSA, keeping a random 75-100% of its sentences.

    Returns the summary as a single string; every sentence is followed by
    one space (including the last one, as in the original implementation).
    ## sumy: https://github.com/miso-belica/sumy/tree/dev/sumy/summarizers
    """
    sentences = nltk.sent_tokenize(s)
    sentenceCount = len(sentences)
    # Keep between 75% and 100% of the original sentence count.
    randSentenceCount = randint(int((sentenceCount / 100) * 75), sentenceCount)
    # randCount = random.randint(iround(float((sentenceCount / 100) * 55)), iround(sentenceCount))
    parser = PlaintextParser.from_string(s, Tokenizer("german"))
    stemmer = Stemmer("german")
    # summarizer = TextRankSummarizer(stemmer)
    summarizer = Summarizer(stemmer)
    summary = summarizer(parser.document, randSentenceCount)
    # FIX: build the result with join() instead of quadratic string +=.
    return "".join(str(sentence) + " " for sentence in summary)


# Todos:
# create a stopword list in German
# if a stopword is part of a synonym give bad minus points
def SynRanker(s, t):
    """Score synonym candidate *s* as a replacement for original word *t*.

    Returns a float rank (higher is better), or a sentinel value:
    -10 for unusable input, -1 when *s* and *t* are identical.
    """
    # Unusable input: empty or non-string arguments.
    if not s or not t:
        return -10
    if not isinstance(s, basestring) or not isinstance(t, basestring):
        return -10

    startVal = 1.0
    lenSyn = len(s)
    re_sonder = r"(\?|\.|\,|\;|\:|\!|\d)"  # punctuation or digits
    # NOTE(review): \w matches any word character, so this "whitespace"
    # penalty fires for nearly every candidate -- confirm this is intended.
    re_space = r"(\t|\r|\n|\s|\w)"
    firstS = s[0:1]
    firstT = t[0:1]

    if s == t:       # identical word is never a useful synonym
        return -1
    if lenSyn <= 0:  # defensive; 'not s' above already rejects ''
        return -10

    # Length scoring: 3..13 characters is neutral, shorter is penalised.
    if lenSyn < 3:
        startVal -= 0.35

    # Multi-word synonyms are penalised; short single words are rewarded.
    if ' ' in s:
        startVal -= 0.75 if lenSyn >= 14 else 0.55
    else:
        startVal += -0.05 if lenSyn >= 14 else 0.05

    if re.search(re_space, s) is not None:
        startVal -= 0.50
    if re.search(re_sonder, s) is not None:
        startVal -= 0.075

    # Prefer candidates whose capitalisation matches the original word.
    if firstS.isupper() and firstT.isupper():
        startVal += 0.15
    elif firstS.islower() and firstT.islower():
        startVal += 0.15
    elif firstS.isupper() and not firstT.isupper():
        startVal -= 0.25
    elif firstS.islower() and not firstT.islower():
        startVal -= 0.25

    # later: ResultCodes
    return float(startVal)


def iround(x):
    """iround(number) -> integer

    Round a number to the nearest integer."""
    return int(round(x) - .5) + (x > 0)


def getSynLeipzig(sl):
    """Query the Leipzig 'Wortschatz' thesaurus for synonyms of *sl*.

    Returns a list of synonym strings (possibly empty).  Every successful
    lookup is archived as a JSON string in the synonym_leipzig table.
    """
    # print ("Auto Syn - Leipzig: ", libleipzig.Thesaurus("Auto",10))
    retContent = []
    retSaveMysql = "W:" + sl
    # Guard clauses: only look up proper strings of at least 3 characters.
    if not sl:
        return retContent
    elif not isinstance(sl, basestring):
        return retContent
    elif len(sl) < 3:
        return retContent

    synLeipzig = libleipzig.Thesaurus(sl, 50)
    if not synLeipzig:
        return retContent
    for aSyn in synLeipzig:
        retContent.append(str(aSyn[0]))
        retSaveMysql += ";S:" + str(aSyn[0])

    if len(retSaveMysql) > 5:
        raw = json.dumps(retSaveMysql)
        loggit = "INSERT INTO synonym_leipzig(raw,uid) VALUES(%s, %s)"
        try:
            cursorMysql.execute(loggit, (raw, 0))
            mysql.commit()
        except MySQLdb.ProgrammingError:
            print("Function -getSynLeipzig()- failed: The following mysql query failed:")
            print(loggit)
    return retContent


# ---------------------------------------------------------------------------
# Main processing: summarize the text, tokenize it, then replace roughly
# every changeEveryWord-th eligible word by its best-ranked synonym.
# Recognised names are never replaced.
# ---------------------------------------------------------------------------
# sent_tokenize_list = sent_tokenize(text)
tSumy = summarizeText(text)
tokens = nltk.word_tokenize(tSumy)
tokensRaw = nltk.word_tokenize(tSumy)  # parallel copy for the transliterated output

count = -1
changeEveryWord = 9  # Leistungsschutzrecht: 7 Zeichen duerfen genutzt werden, darueber muss geaendert werden
changeEveryWordFlag = 0
changeEveryWordTemp = 0  # temporary upcount

for word in tokens:
    count += 1

    # Is this token a known name?  Names must not be replaced.
    # FIX: parameterized query instead of '%s' string interpolation --
    # tokens come from arbitrary input text (SQL injection risk).
    cursorMysql.execute(
        "SELECT * FROM namen_table WHERE BINARY `name` = %s LIMIT 1;", (word,))
    name_content = cursorMysql.fetchone()

    # search_query = Search(indexes=['onetipp_name'], config=SphinxitConfig)
    # search_query = search_query.match(word).options(
    #     ranker='proximity_bm25',
    #     max_matches=1,
    #     max_query_time=350,
    #     field_weights={'name': 100, 'gender': -10000, 'language': -10000, 'meaning': -10000},
    # )
    # sphinx_result = search_query.ask()

    # es wurde ein Name gefunden -> kein Synonym austauschen
    if name_content is not None:
        tokens[count] = '' + deumlaut(word) + ''
        tokensRaw[count] = deumlaut(word)
        continue

    # Reset the skip window after changeEveryWord-1 skipped words.
    if changeEveryWordTemp == (changeEveryWord - 1):
        changeEveryWordFlag = 0
        changeEveryWordTemp = 0
    if changeEveryWordFlag == 1:
        changeEveryWordTemp += 1

    if len(word) >= 2 and changeEveryWordFlag == 0:
        # Versuche zuerst die Leipzig-DB anzufordern.
        synDictLeipzig = {}
        sLeipzigList = getSynLeipzig(word)
        if sLeipzigList:
            for wSynL in sLeipzigList:
                if wSynL not in noDoubleHash:
                    synDictLeipzig[wSynL] = SynRanker(wSynL, word)
            sortedSynList = sorted(synDictLeipzig.items(), key=lambda x: x[1], reverse=True)
            # FIX: guard against an empty candidate list (every synonym
            # already in noDoubleHash) -- previously raised an uncaught
            # IndexError on sortedSynList[0].
            if sortedSynList:
                # later: randomly choose one of the synonyms sharing the top rating
                firstBestSynHit = str(sortedSynList[0][0])
                tokens[count] = '' + deumlaut(firstBestSynHit) + ''
                noDoubleHash.add(firstBestSynHit)
                tokensRaw[count] = deumlaut(firstBestSynHit)
                changeEveryWordFlag = 1
                changeEveryWordTemp += 1
        else:
            # Nutze unsere lokale Synonym-MySQL-Datenbank via Sphinx.
            search_query_syn = Search(indexes=['onetipp_syn_simple'], config=SphinxitConfig)
            search_query_syn = search_query_syn.match(word).options(
                ranker='proximity_bm25',
                max_matches=1,
                max_query_time=350,
                field_weights={'synonyms': 100},
            )
            sphinx_result_syn = search_query_syn.ask()
            synID = 0
            try:
                synID = sphinx_result_syn['result']['items'][0].values()[0]
                if synID > 0:
                    # spaeter: finde via Sphinx noch mehr Synonyme und parse diese alle
                    # FIX: parameterized query (was '%s' interpolation).
                    cursorMysql.execute(
                        "SELECT synonyms FROM synonym_unique_simple WHERE uid = %s", (synID,))
                    syn_content = cursorMysql.fetchone()
                    # FIX: check the row for None BEFORE decoding it --
                    # list(None) raised TypeError in the original order.
                    if syn_content:
                        synContent = list(syn_content)[0].decode(encoding="utf-8", errors="ignore")
                        synwords = synContent.split(";")
                        # http://www.saltycrane.com/blog/2007/09/how-to-sort-python-dictionary-by-keys/
                        synDict = {}
                        for wSyn in synwords:
                            if wSyn not in noDoubleHash:
                                synDict[wSyn] = SynRanker(wSyn, word)
                        sortedSynList = sorted(synDict.items(), key=lambda x: x[1], reverse=True)
                        if sortedSynList:
                            firstBestSynHit = str(sortedSynList[0][0])
                            tokens[count] = '' + deumlaut(firstBestSynHit) + ''
                            noDoubleHash.add(firstBestSynHit)
                            tokensRaw[count] = deumlaut(firstBestSynHit)
                            changeEveryWordFlag = 1
                            changeEveryWordTemp += 1
            except IndexError:
                # No sphinx hit for this word -- leave it unchanged.
                pass

# Datei schreiben: synonymized text plus a Russian transliteration.
outputtext = ' '.join(tokens)
outputtextRussia = ' '.join(tokensRaw)
with codecs.open(outputfile, 'w') as f:
    f.write(outputtext)
    f.write("\n\n\n")
    f.write("RUSSISCHE TRANSLITERATION:BEISPIEL VERSION")
    f.write("\n\n\n\n")
    f.write(translit(outputtextRussia, 'ru'))
    # FIX: removed redundant explicit f.close() -- the with-block closes f.

mysql.commit()
mysql.close()
exit(0)

# re_sonder = r"[(\?|\.|\!)]$(\)"
# re_space = r"(\t|\r|\n|\s|\w)"