#!/usr/bin/python # -*- coding: latin1 -*- from __future__ import division from __future__ import absolute_import from __future__ import division, print_function, unicode_literals ########################### ### Autor: Sebastian Enger / M.Sc. ### Copyright: Sebastian Enger ### Licence: Commercial / OneTipp ### Version: 1.0.4a - 14-10-2015@22:53 Uhr ### Contact: sebastian.enger@gmail.com ### OneTipp Text Tool in Python ########################### ######## export PYTHON_EGG_CACHE=/tmp import pprint import os import nltk # import rocksdb # shared library kann aktuell noch nicht gelesen werden import MySQLdb # apt-get install python-mysqldb from sphinxit.core.processor import Search # http://sphinxit.readthedocs.org/en/latest/ from sphinxit.core.helpers import BaseSearchConfig from random import randint import codecs import sys from sumy.parsers.plaintext import PlaintextParser from sumy.nlp.tokenizers import Tokenizer from sumy.summarizers.lsa import LsaSummarizer as Summarizer from sumy.nlp.stemmers import Stemmer # import smrzr # https://github.com/lekhakpadmanabh/Summarizer import re os.environ['PYTHON_EGG_CACHE'] = '/home/compress/' ###python -m nltk.downloader -d /usr/share/nltk_data all ####python -m nltk.downloader all ###########nltk.download() # nltk.download("punkt") reload(sys) sys.setdefaultencoding('latin-1') class SphinxitConfig(BaseSearchConfig): DEBUG = False WITH_META = False WITH_STATUS = False POOL_SIZE = 5 # SQL_ENGINE = 'oursql' SEARCHD_CONNECTION = { 'host': '127.0.0.1', 'port': 9977, } # delimiters = ['\n', ' ', ',', '.', '?', '!', ':', ';', '\s', '\t', '\r'] # http://pyrocksdb.readthedocs.org/en/v0.4/tutorial/index.html # https://github.com/sphinxsearch/sphinx/blob/master/api/sphinxapi.py # http://www.tutorialspoint.com/python/python_database_access.htm # mysql = MySQLdb.connect("localhost","root","###########99","onetipp" ) # last working sphinx = MySQLdb.connect( host='127.0.0.1', user='root', passwd='###########99', db='onetipp', port=9977) 
# (original-umlaut, ASCII replacement) pairs applied by deumlaut(), in order.
_UMLAUT_REPLACEMENTS = (
    (u'\xdf', u'ss'),
    (u'\xfc', u'ue'),
    (u'\xdc', u'Ue'),
    (u'\xf6', u'oe'),
    (u'\xd6', u'Oe'),
    (u'\xe4', u'ae'),
    (u'\xc4', u'Ae'),
)

# Punctuation and digits that lower a synonym's rank in SynRanker().
_SPECIAL_CHARS_RE = re.compile(r"(\?|\.|\,|\;|\:|\!|\d)")


def deumlaut(s):
    """Return *s* with German umlauts and sharp s replaced by ASCII digraphs.

    For example a-umlaut becomes "ae" and sharp s becomes "ss".
    """
    for umlaut, replacement in _UMLAUT_REPLACEMENTS:
        s = s.replace(umlaut, replacement)
    return s


def summarizeText(s):
    """Summarize the German text *s* with sumy's LSA summarizer.

    The number of sentences kept is chosen at random between
    ``sentenceCount - 5`` and ``sentenceCount`` (never fewer than one), where
    ``sentenceCount`` is the number of sentences NLTK finds in *s*.

    Returns the selected sentences, each followed by a single space.
    Returns an empty string when *s* contains no sentences.
    """
    ## sumy: https://github.com/miso-belica/sumy/tree/dev/sumy/summarizers
    sentenceCount = len(nltk.sent_tokenize(s))
    if sentenceCount == 0:
        return ""
    # Clamp the lower bound: for texts with five or fewer sentences the
    # unclamped range would allow zero or negative sentence counts.
    randSentenceCount = randint(max(1, sentenceCount - 5), sentenceCount)
    parser = PlaintextParser.from_string(s, Tokenizer("german"))
    summarizer = Summarizer(Stemmer("german"))
    summary = summarizer(parser.document, randSentenceCount)
    return "".join(str(sentence) + " " for sentence in summary)


# Todos:
# create a stopword list in German
# if a stopword is part of a synonym
# give bad minus points
def SynRanker(s, t):
    """Score synonym candidate *s* as a replacement for the word *t*.

    Returns:
        -1 when *s* equals *t*, -10 when *s* is empty, otherwise a float that
        starts at 1.0 and is adjusted as follows:

        * shorter than 3 characters: -0.65
        * contains a space: -0.75 if 14+ characters, else -0.55
        * no space: -0.05 if 14+ characters, else +0.05
        * contains punctuation (``? . , ; : !``) or a digit: -0.12

    Side effect: prints a diagnostic breakdown of the score to stdout for
    every candidate that is neither identical to *t* nor empty.
    """
    if s == t:
        return -1
    lenSyn = len(s)
    if lenSyn <= 0:
        return -10

    startVal = 1.0
    if lenSyn < 3:
        startVal -= 0.65

    hasSpace = ' ' in s
    if hasSpace:
        if lenSyn >= 14:
            startVal -= 0.75
        else:
            startVal -= 0.55
    elif lenSyn >= 14:
        startVal -= 0.05
    else:
        startVal += 0.05

    synhasSonder = _SPECIAL_CHARS_RE.search(s) is not None
    if synhasSonder:
        startVal -= 0.12

    synHasDigits = any(ch.isdigit() for ch in s)

    print("Synonym: ", s)
    print("\n")
    print("Length: ", lenSyn)
    print("\n")
    print("Digits: ", synHasDigits)
    print("\n")
    print("Space: ", hasSpace)
    print("\n")
    print("Sonderzeichen: ", synhasSonder)
    print("\n")
    print("SynRank: ", startVal)
    print("\n")
    print("---------------------------------------------------\n")
    # later ResultCodes
    return float(startVal)


def SynDictCalculator(s, t=None):
    """Map each synonym's rank to the synonym.

    Args:
        s: iterable of synonym strings.
        t: the word the synonyms are ranked against (passed to SynRanker).
            Optional so that the original one-argument call form keeps working.

    Returns:
        A dict ``{rank: synonym}``. Synonyms that receive the same rank
        overwrite each other; the last one wins.
    """
    synDict = {}
    for cSyn in s:
        synDict[SynRanker(cSyn, t)] = cSyn
    return synDict


def iround(x):
    """iround(number) -> integer

    Round a number to the nearest integer; halves are rounded away from zero.
    """
    truncated = int(x)
    if abs(x - truncated) >= 0.5:
        truncated += 1 if x > 0 else -1
    return truncated


def _is_known_name(cursor, word):
    """Return True when *word* matches an entry in the names table."""
    # Parameterized query: token text may contain quotes.
    cursor.execute(
        "SELECT * FROM (namen_table) WHERE name LIKE %s LIMIT 1;", (word,))
    return cursor.fetchone() is not None


def _fetch_synonyms(cursor, word):
    """Return the list of stored synonyms for *word*, or [] when none exist."""
    search_query_syn = Search(indexes=['onetipp_syn_simple'], config=SphinxitConfig)
    search_query_syn = search_query_syn.match(word).options(
        ranker='proximity_bm25',
        max_matches=1,
        max_query_time=350,
        field_weights={'synonyms': 100},
    )
    sphinx_result_syn = search_query_syn.ask()
    try:
        # First value of the first hit is used as the synonym row id.
        synID = list(sphinx_result_syn['result']['items'][0].values())[0]
    except IndexError:
        # No Sphinx hit for this word.
        return []
    if not synID > 0:
        return []
    cursor.execute(
        "SELECT synonyms FROM (synonym_unique_simple) WHERE uid= %s", (synID,))
    syn_content = cursor.fetchone()
    if not syn_content:
        return []
    return syn_content[0].split(";")


def _choose_synonym(word, synonyms):
    """Return the replacement for *word* from *synonyms*, or None.

    The first synonym shorter than 25 characters whose first letter has the
    same case as the first letter of *word* is chosen and returned with its
    umlauts transliterated.
    """
    # Ranking is diagnostic only for now: SynRanker prints its breakdown, but
    # the scores do not yet influence which synonym is chosen.
    # later: Randomly choose one of the synonyms that have all the highest rating
    for candidate in synonyms:
        SynRanker(candidate, word)

    lstcWord = word[0:1]
    for candidate in synonyms:
        if len(candidate) >= 25:
            continue
        lstcSyn = candidate[0:1]
        bothUpper = lstcSyn.isupper() and lstcWord.isupper()
        bothLower = lstcSyn.islower() and lstcWord.islower()
        if bothUpper or bothLower:
            return deumlaut(candidate)
    return None


def main(inputfile, outputfile, cursor):
    """Summarize *inputfile*, swap in synonyms, and write *outputfile*.

    Args:
        inputfile: path of the latin-1 encoded text to rewrite.
        outputfile: path the rewritten, space-joined tokens are written to.
        cursor: open MySQL cursor used for the name and synonym lookups.

    Tokens of five or more characters are processed: a token found in the
    names table is protected (only transliterated when capitalized); any
    other token is replaced by a stored synonym when a suitable one exists.
    """
    # Decode explicitly while reading instead of discarding the decode result.
    with codecs.open(inputfile, 'r', encoding='latin-1') as handle:
        text = handle.read()

    # Summarize the text first and then work on it
    tokens = nltk.word_tokenize(summarizeText(text))

    for index, word in enumerate(tokens):
        if len(word) < 5:
            continue
        # The Sphinx query on the 'onetipp_name' index was built but never
        # executed (its ask() call was commented out), so names are looked up
        # in MySQL only.
        if _is_known_name(cursor, word):
            # A name was recognized: do not swap it for a synonym.
            if word[0:1].isupper():
                tokens[index] = deumlaut(word)
            continue
        replacement = _choose_synonym(word, _fetch_synonyms(cursor, word))
        if replacement is not None:
            tokens[index] = replacement

    # Write the result file.
    with codecs.open(outputfile, 'w', encoding='latin-1') as handle:
        handle.write(' '.join(tokens))


if __name__ == "__main__":
    # http://www.tutorialspoint.com/python/python_command_line_arguments.htm
    if len(sys.argv) < 3:
        sys.exit("usage: %s <inputfile> <outputfile>" % sys.argv[0])

    # SphinxQL cursor on the module-level `sphinx` connection (currently unused).
    cursorSphinx = sphinx.cursor()
    # MySQL connection used for the name and synonym lookups.
    mysql = MySQLdb.connect(
        host='127.0.0.1',
        user='root',
        passwd='###########99',
        db='onetipp',
        port=3306)
    cursorMysql = mysql.cursor()
    try:
        main(sys.argv[1], sys.argv[2], cursorMysql)
    finally:
        mysql.close()
    sys.exit(0)