#!/usr/bin/python
# -*- coding: iso-8859-15 -*-
###########################
### Autor: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.3 - 11-10-2015@21:53 Uhr
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python
###########################
######## export PYTHON_EGG_CACHE=/tmp
import os
import pprint
import nltk
#import rocksdb # shared library cannot currently be loaded
import MySQLdb # apt-get install python-mysqldb
from sphinxit.core.processor import Search # http://sphinxit.readthedocs.org/en/latest/
from sphinxit.core.helpers import BaseSearchConfig
import random
import codecs
import sys
# Set PYTHON_EGG_CACHE to /tmp from inside the script (same value as the
# `export PYTHON_EGG_CACHE=/tmp` hint in the file header).
# NOTE(review): this assignment runs after the third-party imports above;
# confirm that is early enough for the modules that need the egg cache.
os.environ['PYTHON_EGG_CACHE'] = '/tmp'
# sent_tokenize is only referenced by a commented-out line further down.
from nltk.tokenize import sent_tokenize
# The NLTK data can be installed with one of:
#   python -m nltk.downloader -d /usr/share/nltk_data all
#   python -m nltk.downloader all
#   nltk.download()
# Python 2 idiom: reload(sys) makes sys.setdefaultencoding() available again
# so the process-wide default string encoding can be switched to latin-1.
# The order of these two statements matters.
reload(sys)
sys.setdefaultencoding('latin-1')
class SphinxitConfig(BaseSearchConfig):
    """Settings object passed as ``config`` to every sphinxit ``Search``."""

    # Option flags; their exact semantics are defined by BaseSearchConfig.
    DEBUG = True
    WITH_META = True
    WITH_STATUS = True
    POOL_SIZE = 5
    # SQL_ENGINE = 'oursql'

    # Host and port of the searchd instance the queries are sent to.
    SEARCHD_CONNECTION = {'host': '127.0.0.1', 'port': 9977}
# Pretty-printer used by the (currently commented-out) debug dumps.
pp = pprint.PrettyPrinter(indent=4)
#delimiters = ['\n', ' ', ',', '.', '?', '!', ':', ';', '\s', '\t', '\r']

# References:
#   http://pyrocksdb.readthedocs.org/en/v0.4/tutorial/index.html
#   https://github.com/sphinxsearch/sphinx/blob/master/api/sphinxapi.py
#   http://www.tutorialspoint.com/python/python_database_access.htm

# Login shared by both connections below; only the port differs.
# NOTE(review): credentials are hard-coded in the source file -- consider
# moving them to configuration or the environment.
_db_login = {
    'host': '127.0.0.1',
    'user': 'root',
    'passwd': '###########99',
    'db': 'onetipp',
}

# SphinxQL endpoint on port 9977. Neither `sphinx` nor `cursorSphinx` is
# used by the visible processing code, which talks to Sphinx via sphinxit.
sphinx = MySQLdb.connect(port=9977, **_db_login)
cursorSphinx = sphinx.cursor()

# MySQL server on port 3306; `cursorMysql` serves the synonym lookups.
mysql = MySQLdb.connect(port=3306, **_db_login)
cursorMysql = mysql.cursor()
# Command line: <inputfile> <outputfile>
# (see http://www.tutorialspoint.com/python/python_command_line_arguments.htm)
if len(sys.argv) < 3:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("usage: %s <inputfile> <outputfile>" % sys.argv[0])
inputfile = sys.argv[1]
outputfile = sys.argv[2]

# Read the whole input file; the context manager closes the handle again.
with open(inputfile, 'r') as infile:
    raw_text = infile.read()

# The input is latin-1 encoded. The decoded result must be kept: the original
# code called text.decode('latin-1') and threw the return value away.
text = raw_text.decode('latin-1')

#sent_tokenize_list = sent_tokenize(text)
# Split the text into word/punctuation tokens; the replacement loop edits
# this list in place.
tokens = nltk.word_tokenize(text)
#pp.pprint(tokens)
def _matching_ids(index_name, word, weighted_field):
    """Return the ids of the items Sphinx reports for ``word`` in one index.

    The query uses the ``proximity_bm25`` ranker, at most one match and a
    weight of 100 on ``weighted_field``. An answer without ``result`` or
    ``items`` yields an empty list.
    """
    query = Search(indexes=[index_name], config=SphinxitConfig)
    query = query.match(word).options(
        ranker='proximity_bm25',
        max_matches=1,
        field_weights={weighted_field: 100},
    )
    answer = query.ask()
    if 'result' in answer and 'items' in answer['result']:
        return [item['id'] for item in answer['result']['items']]
    return []


def _first_synonym(cursor, syn_id):
    """Return the first ';'-separated synonym stored for ``syn_id``.

    Returns None when the row is missing or holds no usable synonym, so the
    caller can leave the token untouched instead of crashing or blanking it.
    """
    # Parameterized query: the driver quotes the value, not string formatting.
    cursor.execute(
        "SELECT synonyms FROM (synonym_unique) WHERE uid = %s", (syn_id,))
    row = cursor.fetchone()
    if not row or not row[0]:
        return None
    return row[0].split(";")[0] or None


# Walk over all tokens and replace title-cased words in place.
# Documented plan of the original author:
#   1. look the word up in the name index ("write protect" names),
#   2. look the word up in the synonym index and take a synonym.
for position, word in enumerate(tokens):
    if not word.istitle():
        continue

    # Id of the last name-index hit, or 0 when there is none. Computed fresh
    # for every word so a value from an earlier word can never leak in.
    name_ids = _matching_ids('onetipp_name', word, 'name')
    name_id = name_ids[-1] if name_ids else 0

    # NOTE(review): the original comment says a name hit should *prevent*
    # synonym replacement, but the original condition (`skip > 0`) performs
    # the replacement only when a name hit exists. The original behaviour is
    # kept here -- confirm which one is intended.
    if name_id <= 0:
        continue

    for syn_id in _matching_ids('onetipp_syn', word, 'synonyms'):
        replacement = _first_synonym(cursorMysql, syn_id)
        if replacement is not None:
            # A disabled variant of this script picked a random entry with
            # random.choice() instead of always taking the first synonym.
            tokens[position] = replacement
# Write the processed text to the output file, tokens joined by single spaces.
outputtext = ' '.join(tokens)
try:
    # The context manager closes the file; no explicit close() is needed.
    with codecs.open(outputfile, 'w') as f:
        f.write(outputtext)
finally:
    # Release both database connections even if writing the file failed.
    mysql.close()
    sphinx.close()
#print outputtext
# sys.exit is always available, unlike the `exit` helper injected by `site`.
sys.exit(0)