#!/usr/bin/python2.7 -S
# -*- coding: utf-8 -*-
# python -m spacy.en.download
# python -m spacy.de.download
# https://spacy.io/docs/#tutorials
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer
import time
start_time = time.time()
# pip install --upgrade thinc
# pip3 install suds-jurko
import spacy
#import base64
import json
#import site
import codecs
import locale
import shelve
import gensim, logging
import pprint
import os.path
#import sphinxapi as SP
from sphinxapi import *
# pip install --upgrade mysql-python
import MySQLdb
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# pip install --upgrade spacy
# pip3 install --upgrade spacy
# apt-get install python3-pip
# pip install --upgrade pip
# pip3 install --upgrade pip
# pip install -U textblob
# python -m textblob.download_corpora
# pip install --upgrade langdetect
# pip install -U textblob-de
# python -m textblob.download_corpora
#from textblob_de import TextBlobDE as TextBlob
from langdetect import detect
#from textblob_de import TextBlobDE
#from textblob import TextBlob
from libleipzig import *
# https://github.com/palday/libleipzig-python
# pip install --upgrade pattern
import sys  # time is already imported at the top of the script
reload(sys)  # Python 2 workaround so the mixed str/unicode handling below does not raise UnicodeDecodeError
sys.setdefaultencoding('utf-8')
#sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
synonym_cache = '/home/100biere/cache/synonym_cache.shelve'
def get_synonyms(search):
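    """Look up `search` in the Sphinx index "onetopp", fetch the matching rows from the
    MySQL table `synonym_unique` and return a de-duplicated list of synonym lists."""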
docids = []
resList = []
q = '@word '+search
#q = '@synonyms Haus'
mode = SPH_MATCH_EXTENDED
host = 'localhost'
port = 9312
index = 'onetopp'
filtercol = 'group_id'
filtervals = []
sortby = '@relevance desc'
groupby = ''
groupsort = '@group desc'
    limit = 5  # must be at least 1
    weight = {'words': 100, 'synonyms': 65}  # SetFieldWeights() expects one integer weight per field
cl = SphinxClient()
#cl.SetServer ( host, port )
cl.SetServer("/var/run/searchd.sock", 0)
    cl.SetConnectTimeout(7.3)
cl.SetMatchMode (mode)
    ###cl.SetFieldWeights( weight ) # does not work yet
#cl.SetFilter( filtercol, filtervals )
cl.SetGroupBy(groupby, SPH_GROUPBY_ATTR, groupsort)
cl.SetSortMode(SPH_SORT_EXTENDED, sortby)
cl.SetLimits(0, limit, max(limit,10))
    cl.SetRankingMode(SPH_RANK_BM25)
    # run the query
    res = None
    try:
        res = cl.Query(q, index)
    except Exception:
        print('query failed: %s' % cl.GetLastError())
        return []
    if not res:
        # sphinxapi usually signals errors by returning None instead of raising
        print('query failed: %s' % cl.GetLastError())
        return []
    if cl.GetLastWarning():
        print('WARNING: %s\n' % cl.GetLastWarning())
# print('Query \'%s\' retrieved %d of %d matches in %s sec' % (q, res['total'], res['total_found'], res['time']))
# print('Query stats:')
# if 'words' in res:
# for info in res['words']:
# print('\t\'%s\' found %d times in %d documents' % (info['word'], info['hits'], info['docs']))
if 'matches' in res:
n = 1
# print('\nMatches:')
for match in res['matches']:
# attrsdump = ''
# for attr in res['attrs']:
# attrname = attr[0]
# attrtype = attr[1]
# value = match['attrs'][attrname]
# if attrtype==SPH_ATTR_TIMESTAMP:
# value = time.strftime ( '%Y-%m-%d %H:%M:%S', time.localtime(value) )
# attrsdump = '%s, %s=%s' % ( attrsdump, attrname, value )
# print('%d. doc_id=%s, weight=%d%s' % (n, match['id'], match['weight'], attrsdump))
docids.append(match['id'])
n += 1
#pprint.pprint(docids)
SqlQuery = "SELECT word, synonyms FROM synonym_unique WHERE";
for i, val in enumerate(docids):
sql_str = str(val)
if i>0:
#SqlQuery += "\""+sql_str+"\","
SqlQuery += sql_str+","
elif i<=0:
SqlQuery += " `uid` IN (\""+sql_str+"\","
SqlQuery = SqlQuery[:-1]
SqlQuery += ")"
#pprint.pprint("Das ist mein SQL: "+SqlQuery)
# Open database connection
# https://www.tutorialspoint.com/python/python_database_access.htm
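    # A parameterized alternative to the string-built query above (a sketch, assuming the same
    # `synonym_unique`/`uid` schema; MySQLdb substitutes the %s placeholders safely):
    #   placeholders = ",".join(["%s"] * len(docids))
    #   cursor.execute("SELECT word, synonyms FROM synonym_unique WHERE uid IN (" + placeholders + ")", docids)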
if len(docids)>=1:
#pprint.pprint("Das ist mein SQL: "+SqlQuery)
try:
dbs = MySQLdb.connect(user="root",
passwd="###########99",
host="127.0.0.1",
unix_socket="/var/run/mysqld/mysqld.sock",
port=3306,
db="onetopp")
# Execute the SQL command
cursor = dbs.cursor()
cursor.execute(SqlQuery)
dbs.commit()
# Fetch all the rows in a list of lists.
results = cursor.fetchall()
#pprint.pprint(results)
#sys.exit(0)
# disconnect from server
cursor.close()
dbs.close()
for row in results:
word = row[0]
synonyms = row[1]
w = synonyms.split(";")
resList.append(w)
# Now print fetched result
# print "HITS: word=%s,synonyms=%s" % (word, synonyms )
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            print "Error: unable to fetch data (query was: %s)" % SqlQuery
#for row in cursor:
# print(row)
    # make the result list unique while preserving order (set() would lose the ranking order)
    unique = []
    for w in resList:
        if w not in unique:
            unique.append(w)
    return unique
def read_similarity(word, sentences):
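    """Return the similarity score of `word`'s nearest neighbour in a per-word word2vec model
    (training and caching one from `sentences` if none exists on disk), or 0.0 on failure."""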
wv = 0.00000
if word and len(word)>=2 and len(sentences)>=2:
path = "/home/100biere/pretrained/"+word+".w2v.bin"
#print("Working on: "+word)
        # possible later improvement: use "similar_by_word(word, topn=10, restrict_vocab=None)" from
        # https://radimrehurek.com/gensim/models/word2vec.html, which directly returns the best matching words
        # possibly also train one word2vec model over the whole synonym SQL database once and then only load that model on demand
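        # A minimal sketch of that idea (assumes the pre-1.0 gensim API used in this script;
        # the combined-model path is hypothetical):
        #   big_model = gensim.models.Word2Vec.load("/home/100biere/pretrained/synonym_db.w2v.bin")
        #   for candidate, score in big_model.similar_by_word(word, topn=10):
        #       print(candidate, score)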
if os.path.isfile(path):
print ("Trained Gensim Model for Word found: "+word)
model = gensim.models.Word2Vec.load(path)
sim = model.most_similar(positive=[word], negative=[], topn=1)#model.similar_by_word(word)
pprint.pprint(sim)
try:
sim = model.most_similar(positive=[word], negative=[], topn=1)
v = sim[0]
vv = v[1]
return vv
except KeyError:
1
else: # do the training
print ("Doing GENSIM Training -> NO Model for Word found: "+word)
model = gensim.models.Word2Vec(sentences, sg=1, hs=1, iter=50, size=400, workers=1, sorted_vocab=1, alpha=0.325, min_count=1)
            model.init_sims(replace=False)  # keeps the full vectors: the model can still be trained, but uses more memory
            #model.init_sims(replace=True)  # read-only: no further training possible, but uses less memory
model.save(path)
            try:
                sim = model.most_similar(positive=[word], negative=[], topn=1)
                vv = sim[0][1]  # similarity score of the closest neighbour
                return vv
            except KeyError:
                pass
return wv
def get_key(key):
try:
return int(key)
except ValueError:
return key
def exchange_word(o):
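    """Return the best synonym for token `o` according to the word2vec similarity score,
    or the original word if no usable synonym is found."""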
j = str(o)
    if len(j) <= 1:  # skip words shorter than 2 characters
return o
ww = unicode(j, "utf-8")
#r = uni_leipzig(ww)
r = get_synonyms(ww)
#pprint.pprint(ww)
#pprint.pprint(r)
#sys.exit(0)
if not r:
return j
#pprint.pprint(r)
words = r #r.split(",")
#print ("Exchange Word: exchange_word(o)")
n = {}
nn = ""
for mm in words:
m = mm[0]
# print(type(m))
# pprint.pprint(m)
# my_a = m #m[m.find('"')+len('"'):m.rfind('"')]
# my_b = m #unicode(my_a, "utf-8")
        if len(m) >= 2:  # only consider synonyms of at least 2 characters
# print(type(m))
# print(m)
rs = read_similarity(m, r)
# print("for m in words: Similarity:"),
# print(rs)
n[m] = rs
#print(type(my_b))
#print("String: "+my_b)
#sys.exit(0)
#print(type(words))
#print("no schrottinger")
#sys.exit(1)
# for i, val in enumerate(r):
# print(val)
# w = val[0]
# if len(w) >= 2: # Wenn Synonym Länge mehr als 2 Zeichen ist
# rs = read_similarity(w, r)
# n[w] = rs
#
# sys.exit(1)
###print ("Synonyme mit Similarity Score:")
###pprint.pprint(n)
    if len(n) >= 1:
        # pick the candidate with the highest similarity score
        best = max(n, key=lambda key: n[key])
        nn = best.decode("utf-8", "ignore")
        # if the original word and the chosen synonym are identical, drop it and pick the next best candidate
        if nn == ww:
            print("Original word and chosen synonym are identical -> picking the next candidate")
            del n[best]
            if len(n) > 0:
                best = max(n, key=lambda key: n[key])
                nn = best.decode("utf-8", "ignore")
rsr = read_similarity(nn, r)
### if rsr <= 0.011:
### return ww
print("\tOriginal: "+ww+" -> Synonym: "+nn)
print("\tSynonym Similarity Score:"),
print(rsr)
#print("\tAlle Synonyme mit Similarity Score:")
#pprint.pprint(n)
return nn
#oDoc = (u'''Auf der Autobahn fahren viele Autos mit hoher Geschwindigkeit.''')
#oDoc = (u"Der Geschäftsführer ist im Meeting und fragt nach der Telefonnummer von seiner schönen Frau. Sie ist gerade mit ihren kleinen Kindern im Kindergarten und trifft danach ihre Freundinnen im Cafe.")
with codecs.open("/home/100biere/input.txt",'r',encoding='utf8') as f:
oDoc = f.read()
lang = detect(oDoc)
print ("Language detected: "+lang)
print ("Text: "+oDoc)
if lang == "en":
de_nlp = spacy.load('en', tagger=True, parser=False, entity=False)
from textblob import TextBlob
from textblob import Word
    from pattern.en import singularize, pluralize
elif lang == "de":
de_nlp = spacy.load('de', tagger=True, parser=False, entity=False)
from textblob_de import TextBlobDE as TextBlob
from textblob_de import Word
from pattern.de import singularize, pluralize
else:
de_nlp = spacy.load('de', tagger=True, parser=False, entity=False)
from textblob_de import TextBlobDE as TextBlob
from textblob_de import Word
from pattern.de import singularize, pluralize
print ("Tagging & Tokenizing Text")
oDoc_nlp = de_nlp(oDoc, tag=True, parse=False)
nDoc = ""
Tokens = [o for o in oDoc_nlp]
noun_max_change = 3
adj_max_change = 2
verb_max_change = 2
adj_count = 0
noun_count = 0
verb_count = 0
'''
# http://www.clips.ua.ac.be/pages/pattern-de
word = "Katze"
print singularize(word)
print pluralize(word)
sys.exit(0)
pprint.pprint(Tokens)
sys.exit(0)
'''
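# Replace up to noun_max_change nouns, verb_max_change verbs and adj_max_change adjectives
# with their highest-scoring synonym; every other token is copied unchanged.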
for o in Tokens:
    new_word = unicode(str(o), "utf-8")
    if o.pos_ == "NOUN" and noun_count < noun_max_change:
        nDoc += " " + exchange_word(new_word)
        noun_count += 1
    elif o.pos_ == "VERB" and verb_count < verb_max_change:
        nDoc += " " + exchange_word(new_word)
        verb_count += 1
    elif o.pos_ == "ADJ" and adj_count < adj_max_change:
        nDoc += " " + exchange_word(new_word)
        adj_count += 1
    else:
        nDoc += " " + new_word
print("Originalsatz: " + oDoc)
print("Ausgabesatz: " + nDoc)
# process Unicode text
with codecs.open("/home/100biere/output.txt",'w',encoding='utf8') as f:
f.write(nDoc)
print("Script Runtime: --- %s seconds ---" % (time.time() - start_time))
sys.exit(0)
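# NOTE: everything below this point is unreachable because of the sys.exit(0) above;
# uni_leipzig() and calc_similarity() are kept for reference only (uni_leipzig() is the
# commented-out alternative to get_synonyms() inside exchange_word()).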
def uni_leipzig(o):
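    """Fetch synonyms for `o` from the Leipzig Wortschatz web service (libleipzig Thesaurus),
    cache the JSON result in a shelve file and trigger word2vec training via calc_similarity()."""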
mo = str(o).lower()
ow = str(o)
db = shelve.open(synonym_cache)
try:
#print ("Read Cache: ")
t = db[mo] # retrieve cache
data = json.loads(t)
db.close()
# cache hit
if data:
print ("uni_leipzig() Cache hit for Entry: "+mo)
#pprint.pprint(data)
calc_similarity(mo, data)
return data
    except Exception:
        pass  # cache miss -> fall through to the web call below
for x in range(0, 3):
try:
#r = Cooccurrences("warmen", 5, 10)
uo = unicode(ow, "utf-8")
print(x),
print(" -> WebCall for Wort:"),
print(uo)
# mysql -u root -p openthesaurus < openthesaurus_dump.sql
#SELECT word FROM `term` WHERE `synset_id` = "1" AND `language_id` = "2"
# https://github.com/arbox/wlapi/blob/master/test/fixtures/vcr_cassettes/synonyms.yml
r = Thesaurus(ow, 20)
pprint.pprint(r)
# if we have a full list of Thesaurus Elements then we are happy
if r:
b1 = json.dumps(r)
db = shelve.open(synonym_cache)
db[mo] = b1 # store cache
db.close()
calc_similarity(mo, b1)
print ("uni_leipzig() Cache Write for Entry: "+mo)
pprint.pprint(b1)
return b1
except Exception as e:
exc_type, exc_obj, exc_tb = sys.exc_info()
fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
print(exc_type, fname, exc_tb.tb_lineno)
print("Error while doing WebCall to Uni Leipzig")
continue
return False
# http://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
# https://www.quora.com/Is-skip-gram-negative-sampling-better-than-CBOW-NS-for-word2vec-If-so-why
def calc_similarity(word, sentences):
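    """Train a word2vec model for `word` from `sentences` and store it under
    /home/100biere/pretrained/ so read_similarity() can load it later."""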
    if word and len(word) >= 2 and len(sentences) >= 2:
        path = "/home/100biere/pretrained/" + word + ".w2v.bin"
        # if the model file already exists we have a trained model for this word and can safely return
        if os.path.isfile(path):
            print ("Trained Word2Vec model already on disk for input word: " + word)
            return 1
        print ("Calculating Word2Vec for input word: " + word)
        model2 = gensim.models.Word2Vec(sentences, sg=1, hs=1, iter=400, size=400, workers=1, sorted_vocab=1, alpha=0.325, min_count=1)
        model2.init_sims(replace=False)  # keeps the full vectors: the model can still be trained, but uses more memory
        #model2.init_sims(replace=True)  # read-only: no further training possible, but uses less memory
        model2.save(path)
    return 1