#!/usr/bin/python2.7 -S
# -*- coding: utf-8 -*-
# python -m spacy.en.download
# python -m spacy.de.download
# https://spacy.io/docs/#tutorials
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer
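# Synonym-replacement script: reads input.txt, detects its language, swaps a limited
# number of nouns/verbs/adjectives for Leipzig-Wortschatz synonyms ranked by Word2Vec
# similarity, and writes the rewritten text to output.txt.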
import time
start_time = time.time()
# pip install --upgrade thinc
# pip3 install suds-jurko
import spacy
#import base64
import json
#import site
import codecs
import locale
import shelve
import gensim, logging
import pprint
import os.path
#import sphinxapi as SP
from sphinxapi import *
# pip install --upgrade mysql-python
import MySQLdb
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# pip3 install --upgrade spacy
# apt-get install python3-pip
# pip install --upgrade pip
# pip3 install --upgrade pip
# pip install -U textblob
# python -m textblob.download_corpora
# pip install --upgrade langdetect
# pip install -U textblob-de
# python -m textblob.download_corpora
#from textblob_de import TextBlobDE as TextBlob
from langdetect import detect
#from textblob_de import TextBlobDE
#from textblob import TextBlob
from libleipzig import * # pip install --upgrade libleipzig
# pip install --upgrade libleipzig
# pip3 install --upgrade libleipzig
# pip install --upgrade pattern
import sys, time  # time was already imported above; re-importing is a no-op
reload(sys)
sys.setdefaultencoding('utf-8')
#sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
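# get_synonyms() runs a fixed sample query ('@word Haus') against the local Sphinx
# index 'onetopp', collects the matching document ids and then fetches the
# corresponding word/synonyms rows from the MySQL table synonym_unique.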
def get_synonyms():
docids = []
q = '@word Haus'
#q = '@synonyms Haus'
mode = SPH_MATCH_EXTENDED
host = 'localhost'
port = 9312
index = 'onetopp'
filtercol = 'group_id'
filtervals = []
sortby = '@relevance desc'
groupby = ''
groupsort = '@group desc'
    limit = 5 # must be at least 1
weight = {'words': [100], 'synonyms': [65]}
cl = SphinxClient()
#cl.SetServer ( host, port )
cl.SetServer( "/var/run/searchd.sock", 0 )
    cl.SetConnectTimeout(7.3)
    cl.SetMatchMode(mode)
    ###cl.SetFieldWeights( weight ) # not working yet; sphinxapi's SetFieldWeights expects
    ### a dict of field name -> int (e.g. {'words': 100, 'synonyms': 65}), so the list
    ### values in `weight` above are probably why this call failed
#cl.SetFilter( filtercol, filtervals )
cl.SetGroupBy( groupby, SPH_GROUPBY_ATTR, groupsort )
cl.SetSortMode( SPH_SORT_EXTENDED, sortby )
cl.SetLimits( 0, limit, max(limit,10) )
    cl.SetRankingMode(SPH_RANK_BM25)
# do query
    # SphinxClient.Query() reports connection/query failures through GetLastError()
    # rather than raising, so check for an empty result instead of catching exceptions
    res = cl.Query(q, index)
    if not res:
        print('query failed: %s' % cl.GetLastError())
        sys.exit(1)
    if cl.GetLastWarning():
        print('WARNING: %s\n' % cl.GetLastWarning())
print('Query \'%s\' retrieved %d of %d matches in %s sec' % (q, res['total'], res['total_found'], res['time']))
print('Query stats:')
if 'words' in res:
for info in res['words']:
print('\t\'%s\' found %d times in %d documents' % (info['word'], info['hits'], info['docs']))
if 'matches' in res:
n = 1
print('\nMatches:')
for match in res['matches']:
attrsdump = ''
for attr in res['attrs']:
attrname = attr[0]
attrtype = attr[1]
value = match['attrs'][attrname]
if attrtype==SPH_ATTR_TIMESTAMP:
value = time.strftime ( '%Y-%m-%d %H:%M:%S', time.localtime(value) )
attrsdump = '%s, %s=%s' % ( attrsdump, attrname, value )
print('%d. doc_id=%s, weight=%d%s' % (n, match['id'], match['weight'], attrsdump))
docids.append(match['id'])
n += 1
#pprint.pprint(docids)
    if not docids:
        print('no Sphinx matches, nothing to look up in MySQL')
        return 0
    # build: SELECT word, synonyms FROM synonym_unique WHERE `uid` IN ("id1","id2",...)
    uid_list = ", ".join('"%s"' % str(val) for val in docids)
    SqlQuery = "SELECT word, synonyms FROM synonym_unique WHERE `uid` IN (%s)" % uid_list
#pprint.pprint(SqlQuery)
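    # Hardening note (a sketch, not what the code above does): MySQLdb can bind the ids
    # as parameters instead of interpolating them into the SQL string, e.g.
    #   placeholders = ", ".join(["%s"] * len(docids))
    #   cursor.execute("SELECT word, synonyms FROM synonym_unique WHERE `uid` IN (" + placeholders + ")", docids)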
# Open database connection
# https://www.tutorialspoint.com/python/python_database_access.htm
try:
dbs = MySQLdb.connect(user="root",
passwd="###########99",
host="127.0.0.1",
unix_socket="/var/run/mysqld/mysqld.sock",
port=3306,
db="onetopp")
# Execute the SQL command
cursor = dbs.cursor()
cursor.execute(SqlQuery)
dbs.commit()
# Fetch all the rows in a list of lists.
results = cursor.fetchall()
        pprint.pprint(results)
        #sys.exit(0)  # debug exit; leaving it in would skip the result loop below
for row in results:
word = row[0]
synonyms = row[1]
            # Now print the fetched result
            print("HITS: word=%s,synonyms=%s" % (word, synonyms))
    except MySQLdb.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        #print(cursor._last_executed)  # only defined after a successful execute()
        print("Error: unable to fetch data")
#for row in cursor:
# print(row)
# disconnect from server
cursor.close()
dbs.close()
#sys.exit(0)
return 1
# http://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
# https://www.quora.com/Is-skip-gram-negative-sampling-better-than-CBOW-NS-for-word2vec-If-so-why
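# calc_similarity() trains a Word2Vec model on the synonym list of `word` and caches it
# under /home/100biere/model/<word>.bin so repeated runs can simply load it again.
# gensim's Word2Vec defaults to CBOW; skip-gram (the sg=1 option discussed in the link
# above) is not selected here, so the default is an implicit choice of this script.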
def calc_similarity(word, sentences):
path = "/home/100biere/model/"+word+".bin"
    # if the file exists, we already have a trained model and can safely return
if os.path.isfile(path):
return 1
###print ("Calculating Word2Vec for Input Word: "+ word)
model = gensim.models.Word2Vec(sentences, min_count=1, iter=1200, size=500, workers=4, sorted_vocab=1, batch_words=300, alpha=0.025)
model.init_sims(replace=True)
model.save(path)
return 1
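# read_similarity() loads the cached Word2Vec model for `word` (training and saving one
# first if the cache file is missing) and returns the similarity score of the single
# closest neighbour reported by most_similar().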
def read_similarity(word, sentences):
path = "/home/100biere/model/"+word+".bin"
wv = 0.0
if os.path.isfile(path):
model = gensim.models.Word2Vec.load(path)
sim = model.most_similar(positive=[word], negative=[], topn=1, restrict_vocab=200)
v = sim[0]
wv = v[1]
else:
model = gensim.models.Word2Vec(sentences, min_count=1, iter=1200, size=500, workers=4, sorted_vocab=1)
model.init_sims(replace=True)
model.save(path)
sim = model.most_similar(positive=[word], negative=[], topn=1, restrict_vocab=200)
v = sim[0]
wv = v[1]
return wv
def get_key(key):
try:
return int(key)
except ValueError:
return key
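# uni_leipzig() looks the word up in the Leipzig Wortschatz web service via libleipzig
# (Thesaurus for l == 0, Similarity for l == 1), caches the JSON-encoded answer in a
# shelve file, and triggers Word2Vec training on the result.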
def uni_leipzig(l, o):
mo = str(o).lower()
ow = str(o)
db = shelve.open('/home/100biere/cache/synonym_cache.shelve')
data = ""
try:
#print ("Read Cache: ")
t = db[mo] # retrieve cache
data = json.loads(t)
#pprint.pprint(data)
    except Exception:
        # cache miss (or unreadable entry); fall through to the web call below
        pass
db.close()
# cache hit
if data:
print ("uni_leipzig() Cache hit for Entry: "+mo)
calc_similarity(mo, data)
return data
    # retry the web service a few times; r stays None if every attempt fails
    r = None
    for x in range(0, 4):
try:
#r = Cooccurrences("warmen", 5, 10)
uo = unicode(ow, "utf-8")
print("WebCall for Wort:"),
print(uo)
if l == 0:
r = Thesaurus(ow, 150)
elif l == 1:
r = Similarity(ow, 150)
# if we have a full list of Thesaurus Elements then we are happy
if r:
break
except Exception:
continue
    if not r:
        # every attempt failed; return an empty list so the caller can fall back
        return []
    b1 = json.dumps(r)
    db = shelve.open('/home/100biere/cache/synonym_cache.shelve')
    db[mo] = b1 # store cache
    ###print ("uni_leipzig() Cache Write for Entry: "+mo)
    #pprint.pprint(b1)
    db.close()
    # parse the JSON back so this path returns the same structure as the cache-hit path above
    data = json.loads(b1)
    calc_similarity(mo, data)
    return data
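# exchange_word() fetches synonym candidates for the original word via uni_leipzig(),
# scores every candidate with read_similarity() and returns the best-scoring one; if the
# best score is <= 0.011 the original word is kept.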
def exchange_word(o):
j = str(o)
ww = unicode(j, "utf-8")
r = uni_leipzig(0, ww)
#print ("Exchange Word: exchange_word(o)")
#pprint.pprint(r)
    if not r:
        # no synonym candidates; keep the original word
        return ww
    n = {}
for i, val in enumerate(r):
w = val[0]
rs = read_similarity(w, r)
n[w] = rs
###print ("Synonyme mit Similarity Score:")
###pprint.pprint(n)
# hole das Element aus dem Dict, das den höhsten Similarity Score hat
nn = max(n, key=lambda key: n[key])
# wenn das Originalwort und das Synonym gleich sind, dann lösche es aus dem Dict und teste neu
if nn == ww:
#print("Originalwort und das Synonym gleich sind -> NEU")
del n[nn]
nn = max(n, key=lambda key: n[key])
rsr = read_similarity(nn, r)
if rsr <= 0.011:
return ww
###print("\tOriginal: "+ww+" -> Synonym: "+nn)
###print("\tSynonym Similarity Score:"),
###print(rsr)
#print("\tAlle Synonyme mit Similarity Score:")
#pprint.pprint(n)
return nn
#oDoc = (u'''Auf der Autobahn fahren viele Autos mit hoher Geschwindigkeit.''')
#oDoc = (u"Der Geschäftsführer ist im Meeting und fragt nach der Telefonnummer von seiner schönen Frau. Sie ist gerade mit ihren kleinen Kindern im Kindergarten und trifft danach ihre Freundinnen im Cafe.")
with codecs.open("/home/www/www.onetopp.com/100biere/input.txt",'r',encoding='utf8') as f:
oDoc = f.read()
lang = detect(oDoc)
print ("Language detected: "+lang)
if lang == "en":
de_nlp = spacy.load('en', tagger=True, parser=False, entity=False)
from textblob import TextBlob
from textblob import Word
    from pattern.en import singularize, pluralize
elif lang == "de":
de_nlp = spacy.load('de', tagger=True, parser=False, entity=False)
from textblob_de import TextBlobDE as TextBlob
from textblob_de import Word
from pattern.de import singularize, pluralize
else:
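    # fall back to the German pipeline for any other detected language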
de_nlp = spacy.load('de', tagger=True, parser=False, entity=False)
from textblob_de import TextBlobDE as TextBlob
from textblob_de import Word
from pattern.de import singularize, pluralize
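# Tag the input with spaCy; only the tagger is needed because the replacement loop below
# looks at o.pos_ (NOUN / VERB / ADJ) and nothing else.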
oDoc_nlp = de_nlp(oDoc, tag=True, parse=False)
nDoc = ""
Tokens = [o for o in oDoc_nlp]
noun_max_change = 3
adj_max_change = 1
verb_max_change = 2
adj_count = 0
noun_count = 0
verb_count = 0
'''
# http://www.clips.ua.ac.be/pages/pattern-de
word = "Katze"
print singularize(word)
print pluralize(word)
sys.exit(0)
pprint.pprint(Tokens)
sys.exit(0)
'''
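# Replace content words token by token: at most noun_max_change nouns, verb_max_change
# verbs and adj_max_change adjectives are swapped for a synonym; everything else is
# copied unchanged. Note that joining tokens with a single space does not preserve the
# original spacing and punctuation layout.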
for o in Tokens:
j = str(o)
new_word = unicode(j, "utf-8")
if (o.pos_ == "NOUN"):
if noun_count == noun_max_change:
nDoc += " "+new_word
else:
ew = exchange_word(new_word)
nDoc += " "+''+ew+''
noun_count += 1
elif (o.pos_ == "VERB"):
if verb_count == verb_max_change:
nDoc += " "+new_word
else:
#print ("Verb zum Austausch gefunden\n")
# new_word = ''+w+''
ew = exchange_word(new_word)
nDoc += " "+''+ew+''
verb_count += 1
elif (o.pos_ == "ADJ"):
if adj_count == adj_max_change:
nDoc += " "+new_word
else:
ew = exchange_word(new_word)
nDoc += " "+''+ew+''
adj_count += 1
else:
nDoc += " "+new_word
print("Originalsatz: " + oDoc)
print("Ausgabesatz: " + nDoc)
# write the rewritten Unicode text back out as UTF-8
with codecs.open("/home/www/www.onetopp.com/100biere/output.txt",'w',encoding='utf8') as f:
f.write(nDoc)
print("Script Runtime: --- %s seconds ---" % (time.time() - start_time))
sys.exit(0)