#!/usr/bin/python
# -*- coding: iso-8859-15 -*-
###########################
### Author: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.3 - 11-10-2015@21:53
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python
###########################
"""OneTipp text tool: replace title-cased words of a text with a synonym.

Usage: <script> INPUTFILE OUTPUTFILE

The input file is read as latin-1 and tokenized with NLTK.  Every
title-cased token is looked up in the Sphinx index ``onetipp_name``; when
that lookup yields an id greater than zero, the token is looked up in the
Sphinx index ``onetipp_syn`` and replaced by the first entry of the
matching ``synonym_unique.synonyms`` column (a ';'-separated list) in
MySQL.  The tokens are then joined with single spaces and written to the
output file as latin-1.

This is a Python 2 script (it relies on ``reload(sys)`` and
``sys.setdefaultencoding``).

NOTE(review): the statement nesting was reconstructed from a copy of the
file whose line breaks and indentation had been lost -- confirm against
the original layout.
"""

######## export PYTHON_EGG_CACHE=/tmp
import os
import pprint
import nltk
#import rocksdb # shared library cannot be loaded at the moment
import MySQLdb # apt-get install python-mysqldb
from sphinxit.core.processor import Search # http://sphinxit.readthedocs.org/en/latest/
from sphinxit.core.helpers import BaseSearchConfig
import random
import codecs
import sys

os.environ['PYTHON_EGG_CACHE'] = '/tmp'

from nltk.tokenize import sent_tokenize
###python -m nltk.downloader -d /usr/share/nltk_data all
####python -m nltk.downloader all
###########nltk.download()

# Python 2 only: make latin-1 the implicit str <-> unicode codec, so byte
# strings coming back from MySQL can be mixed with the unicode tokens.
reload(sys)
sys.setdefaultencoding('latin-1')


class SphinxitConfig(BaseSearchConfig):
    """Sphinxit settings used by every ``Search`` issued in this script."""

    DEBUG = True
    WITH_META = True
    WITH_STATUS = True
    POOL_SIZE = 5
    # SQL_ENGINE = 'oursql'
    SEARCHD_CONNECTION = {
        'host': '127.0.0.1',
        'port': 9977,
    }


# Kept for ad-hoc debugging, e.g. pp.pprint(tokens).
pp = pprint.PrettyPrinter(indent=4)

# http://pyrocksdb.readthedocs.org/en/v0.4/tutorial/index.html
# https://github.com/sphinxsearch/sphinx/blob/master/api/sphinxapi.py
# http://www.tutorialspoint.com/python/python_database_access.htm


def _match_ids(index, word, weighted_field):
    """Return the ids of the Sphinx hits for *word* in *index*.

    The query uses the ``proximity_bm25`` ranker, asks for at most one
    match and gives *weighted_field* a weight of 100.  An empty list is
    returned when the response has no ``result``/``items`` entry.
    """
    query = Search(indexes=[index], config=SphinxitConfig)
    query = query.match(word).options(
        ranker='proximity_bm25',
        max_matches=1,
        field_weights={weighted_field: 100},
    )
    response = query.ask()
    if 'result' not in response or 'items' not in response['result']:
        return []
    return [item['id'] for item in response['result']['items']]


def _fetch_synonyms(cursor, uid):
    """Return the synonyms stored for *uid* in ``synonym_unique``.

    The ``synonyms`` column is split on ';'.  An empty list is returned
    when no row exists or the column is empty/NULL, so callers never index
    into ``None``.
    """
    # Bound parameter instead of string formatting: the driver does the quoting.
    cursor.execute(
        "SELECT synonyms FROM synonym_unique WHERE uid = %s", (uid,)
    )
    row = cursor.fetchone()
    if row is None or not row[0]:
        return []
    return row[0].split(";")


# http://www.tutorialspoint.com/python/python_command_line_arguments.htm
# Validate the command line before opening any database connection.
if len(sys.argv) < 3:
    sys.stderr.write("usage: %s INPUTFILE OUTPUTFILE\n" % sys.argv[0])
    sys.exit(2)
inputfile = sys.argv[1]
outputfile = sys.argv[2]

# NOTE(review): credentials are hard-coded; consider moving them to a
# configuration file or the environment.
sphinx = MySQLdb.connect(
    host = '127.0.0.1',
    user = 'root',
    passwd = '###########99',
    db = 'onetipp',
    port = 9977) # sphinxQL
# NOTE(review): this cursor is never used below (all Sphinx queries go
# through sphinxit); the connection is kept so start-up behaves as before.
cursorSphinx = sphinx.cursor()

mysql = MySQLdb.connect(
    host = '127.0.0.1',
    user = 'root',
    passwd = '###########99',
    db = 'onetipp',
    port = 3306) # Mysql
cursorMysql = mysql.cursor()

try:
    # Read the file into a string.  The decoded value is kept (it used to
    # be discarded), so str.istitle() below works on characters instead of
    # raw bytes and also recognises words starting with a non-ASCII capital.
    with open(inputfile, 'r') as source:
        text = source.read().decode('latin-1')

    #sent_tokenize_list = sent_tokenize(text)
    tokens = nltk.word_tokenize(text)

    for count, word in enumerate(tokens):
        if not word.istitle():
            continue

        # 1. Look the word up in the name index.
        # 2. Look it up in the synonym index and take a synonym from
        #    synonym_unique.
        name_ids = _match_ids('onetipp_name', word, 'name')
        skip = name_ids[-1] if name_ids else 0

        # NOTE(review): the original comment here said "a name was found
        # -> do not replace with a synonym", yet the code only replaces
        # when a name id WAS found.  The condition is kept as written;
        # confirm which behaviour is intended.
        if not skip > 0:
            continue

        for synID in _match_ids('onetipp_syn', word, 'synonyms'):
            synwords = _fetch_synonyms(cursorMysql, synID)
            # The first synonym is used; a random pick
            # (random.choice(synwords)) was the earlier variant.
            # Never replace a token with an empty string.
            if synwords and synwords[0]:
                tokens[count] = synwords[0]

    # Write the result file.
    outputtext = ' '.join(tokens)
    with codecs.open(outputfile, 'w', encoding='latin-1') as f:
        f.write(outputtext)
finally:
    # Release both connections even when processing fails.
    cursorMysql.close()
    mysql.close()
    cursorSphinx.close()
    sphinx.close()

#print outputtext
sys.exit(0)