#!/usr/bin/python2.7 -S
# -*- coding: utf-8 -*-
# python -m spacy.en.download
# python -m spacy.de.download
# https://spacy.io/docs/#tutorials
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer
import time
start_time = time.time()

# pip install --upgrade thinc
# pip3 install suds-jurko
import spacy
#import base64
import json
#import site
import codecs
import locale
import shelve
import gensim, logging
import pprint
import os.path
#import sphinxapi as SP
from sphinxapi import *
# pip install --upgrade mysql-python
import MySQLdb
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# pip3 install --upgrade spacy
# apt-get install python3-pip
# pip install --upgrade pip
# pip3 install --upgrade pip3
# pip install -U textblob
# python -m textblob.download_corpora
# pip install --upgrade langdetect
# pip install -U textblob-de
# python -m textblob.download_corpora
#from textblob_de import TextBlobDE as TextBlob
from langdetect import detect
#from textblob_de import TextBlobDE
#from textblob import TextBlob
from libleipzig import *
# pip install --upgrade libleipzig
# pip3 install --upgrade libleipzig
# pip install --upgrade pattern
import sys

reload(sys)
sys.setdefaultencoding('utf-8')
#sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)


def get_synonyms():
    # Query the Sphinx "onetopp" index for a word and fetch the matching
    # synonym rows from MySQL.
    docids = []
    q = '@word Haus'
    #q = '@synonyms Haus'
    mode = SPH_MATCH_EXTENDED
    host = 'localhost'
    port = 9312
    index = 'onetopp'
    filtercol = 'group_id'
    filtervals = []
    sortby = '@relevance desc'
    groupby = ''
    groupsort = '@group desc'
    limit = 5  # must be at least 1
    weight = {'words': [100], 'synonyms': [65]}

    cl = SphinxClient()
    #cl.SetServer(host, port)
    cl.SetServer("/var/run/searchd.sock", 0)
    cl.SetConnectTimeout(7.3)
    cl.SetMatchMode(mode)
    ###cl.SetFieldWeights(weight)  # does not work yet
    #cl.SetFilter(filtercol, filtervals)
    cl.SetGroupBy(groupby, SPH_GROUPBY_ATTR, groupsort)
    cl.SetSortMode(SPH_SORT_EXTENDED, sortby)
    cl.SetLimits(0, limit, max(limit, 10))
    cl.SetRankingMode(SPH_RANK_BM25)

    # run the query
    res = ""
    try:
        res = cl.Query(q, index)
    except Exception:
        print('query failed: %s' % cl.GetLastError())
        sys.exit(1)
    if not res:
        # Query() returns None on failure instead of raising
        print('query failed: %s' % cl.GetLastError())
        sys.exit(1)
    if cl.GetLastWarning():
        print('WARNING: %s\n' % cl.GetLastWarning())

    print('Query \'%s\' retrieved %d of %d matches in %s sec' % (q, res['total'], res['total_found'], res['time']))
    print('Query stats:')
    if 'words' in res:
        for info in res['words']:
            print('\t\'%s\' found %d times in %d documents' % (info['word'], info['hits'], info['docs']))

    if 'matches' in res:
        n = 1
        print('\nMatches:')
        for match in res['matches']:
            attrsdump = ''
            for attr in res['attrs']:
                attrname = attr[0]
                attrtype = attr[1]
                value = match['attrs'][attrname]
                if attrtype == SPH_ATTR_TIMESTAMP:
                    value = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(value))
                attrsdump = '%s, %s=%s' % (attrsdump, attrname, value)
            print('%d. doc_id=%s, weight=%d%s' % (n, match['id'], match['weight'], attrsdump))
            docids.append(match['id'])
            n += 1
    #pprint.pprint(docids)

    if not docids:
        # without any matches the IN (...) clause below would be malformed
        return 0

    SqlQuery = "SELECT word, synonyms FROM synonym_unique WHERE"
    for i, val in enumerate(docids):
        sql_str = str(val)
        if i > 0:
            SqlQuery += "\"" + sql_str + "\","
        else:
            SqlQuery += " `uid` IN (\"" + sql_str + "\","
    SqlQuery = SqlQuery[:-1]
    SqlQuery += ")"
    #pprint.pprint(SqlQuery)

    # Open database connection
    # https://www.tutorialspoint.com/python/python_database_access.htm
    cursor = None
    dbs = None
    try:
        dbs = MySQLdb.connect(user="root", passwd="###########99", host="127.0.0.1",
                              unix_socket="/var/run/mysqld/mysqld.sock", port=3306, db="onetopp")
        # Execute the SQL command
        cursor = dbs.cursor()
        cursor.execute(SqlQuery)
        dbs.commit()
        # Fetch all the rows in a list of lists.
        results = cursor.fetchall()
        pprint.pprint(results)
        #sys.exit(0)  # debug exit; leaving it in would skip the loop and cleanup below
        for row in results:
            word = row[0]
            synonyms = row[1]
            # Now print fetched result
            print("HITS: word=%s,synonyms=%s" % (word, synonyms))
    except MySQLdb.Error, e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        if cursor is not None:
            print(cursor._last_executed)
        print("Error: unable to fetch data")
    #for row in cursor:
    #    print(row)

    # disconnect from server
    if cursor is not None:
        cursor.close()
    if dbs is not None:
        dbs.close()
    return 1
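
# --- Optional sketch (not called anywhere in this script) ---------------------
# The lookup above builds its IN (...) clause by string concatenation. A safer
# variant, assuming the same `synonym_unique` table and connection settings,
# would let MySQLdb escape the doc ids itself via query parameters. The helper
# name and behaviour below are illustrative only.
def fetch_synonym_rows_parameterized(docids):
    """Sketch: fetch (word, synonyms) rows for the given uid list."""
    if not docids:
        return []
    placeholders = ", ".join(["%s"] * len(docids))
    sql = "SELECT word, synonyms FROM synonym_unique WHERE `uid` IN (%s)" % placeholders
    dbs = MySQLdb.connect(user="root", passwd="###########99", host="127.0.0.1",
                          unix_socket="/var/run/mysqld/mysqld.sock", port=3306, db="onetopp")
    try:
        cursor = dbs.cursor()
        # MySQLdb substitutes and escapes one doc id per %s placeholder
        cursor.execute(sql, tuple(docids))
        return cursor.fetchall()
    finally:
        dbs.close()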

# http://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
# https://www.quora.com/Is-skip-gram-negative-sampling-better-than-CBOW-NS-for-word2vec-If-so-why
def calc_similarity(word, sentences):
    # Train and cache a word2vec model for the given word, unless one exists already.
    path = "/home/100biere/model/" + word + ".bin"
    # if the file exists, we have a trained model and can safely return
    if os.path.isfile(path):
        return 1
    ###print("Calculating Word2Vec for Input Word: " + word)
    #model = gensim.models.Word2Vec(sentences, min_count=1, iter=300, size=500, workers=4, sorted_vocab=1, batch_words=500, alpha=0.035)
    model = gensim.models.Word2Vec(sentences, min_count=1, iter=30, workers=4, sorted_vocab=1, alpha=0.025)
    model.init_sims(replace=True)
    model.save(path)
    return 1


def read_similarity(word, sentences):
    # Return the similarity score of the closest neighbour of `word`,
    # training a model first if none has been cached yet.
    path = "/home/100biere/model/" + word + ".bin"
    wv = 0.0
    if os.path.isfile(path):
        model = gensim.models.Word2Vec.load(path)
        try:
            sim = model.most_similar(positive=[word], negative=[], topn=1, restrict_vocab=20000)
            v = sim[0]
            wv = v[1]
            return wv
        except KeyError:
            pass
    else:
        model = gensim.models.Word2Vec(sentences, min_count=1, iter=1200, size=500, workers=4, sorted_vocab=1)
        model.init_sims(replace=True)
        model.save(path)
        try:
            sim = model.most_similar(positive=[word], negative=[], topn=1, restrict_vocab=20000)
            v = sim[0]
            wv = v[1]
            return wv
        except KeyError:
            pass
    return wv


def get_key(key):
    try:
        return int(key)
    except ValueError:
        return key


def uni_leipzig(l, o):
    # Look the word up in the local shelve cache; on a miss, query the
    # Universität Leipzig web service (Thesaurus or Similarity) and cache
    # the JSON-encoded result.
    mo = str(o).lower()
    ow = str(o)
    db = shelve.open('/home/100biere/cache/synonym_cache.shelve')
    data = ""
    t = ""
    try:
        print("Read Cache: ")
        t = db[mo]  # retrieve cache
        data = json.loads(t)
        #pprint.pprint(data)
    except Exception:
        pass
    db.close()
    print(type(data))

    # cache hit
    if data:
        print("uni_leipzig() Cache hit for Entry: " + mo)
        calc_similarity(mo, data)
        # return the raw JSON string so callers get the same type as on a cache miss
        return t

    r = None
    for x in range(0, 1):
        try:
            #r = Cooccurrences("warmen", 5, 10)
            uo = unicode(ow, "utf-8")
            print("WebCall for Wort:"),
            print(uo)
            if l == 0:
                r = Thesaurus(ow, 150)
            elif l == 1:
                r = Similarity(ow, 150)
            # if we have a full list of Thesaurus elements then we are happy
            if r:
                break
        except Exception:
            continue

    b1 = json.dumps(r)
    db = shelve.open('/home/100biere/cache/synonym_cache.shelve')
    db[mo] = b1  # store cache
    ###print("uni_leipzig() Cache Write for Entry: " + mo)
    #pprint.pprint(b1)
    db.close()
    calc_similarity(mo, b1)
    #data = json.loads(array)
    return b1
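
# --- Optional sketch (illustration only, not used by the script) --------------
# uni_leipzig() above mixes two concerns: a shelve-backed JSON cache and the
# Leipzig web-service call. A minimal version of just the cache half, assuming
# the same shelve file path, could look like this; the helper names are hypothetical.
def cache_get(key, shelve_path='/home/100biere/cache/synonym_cache.shelve'):
    """Sketch: return the cached JSON string for `key`, or None on a miss."""
    db = shelve.open(shelve_path)
    try:
        return db.get(str(key).lower())
    finally:
        db.close()

def cache_put(key, value, shelve_path='/home/100biere/cache/synonym_cache.shelve'):
    """Sketch: store `value` (any JSON-serialisable object) under `key`."""
    db = shelve.open(shelve_path)
    try:
        db[str(key).lower()] = json.dumps(value)
    finally:
        db.close()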

def exchange_word(o):
    # Replace the token `o` with the best-scoring synonym from the Leipzig service.
    j = str(o)
    if len(j) <= 1:  # keep the original word if it is shorter than 2 characters
        return o
    ww = unicode(j, "utf-8")
    r = uni_leipzig(0, ww)
    words = r.split(",")
    print("Exchange Word: exchange_word(o)")
    #pprint.pprint(r)

    n = {}
    nn = ""
    for m in words:
        # take the text between the first and the last double quote
        my_a = m[m.find('"') + len('"'):m.rfind('"')]
        my_b = unicode(my_a, "utf-8")
        if len(my_a) >= 2:  # only consider synonyms of at least 2 characters
            rs = read_similarity(my_a, r)
            n[my_a] = rs
        #print(type(my_b))
        #print("String: " + my_b)
    #print(type(words))
    #sys.exit(1)

    # for i, val in enumerate(r):
    #     print(val)
    #     w = val[0]
    #     if len(w) >= 2:  # only consider synonyms of at least 2 characters
    #         rs = read_similarity(w, r)
    #         n[w] = rs
    # sys.exit(1)
    ###print("Synonyms with similarity score:")
    ###pprint.pprint(n)

    if len(n) > 0:
        # take the dict entry with the highest similarity score
        nn = max(n, key=lambda key: n[key])
        # if the original word and the synonym are identical, drop it and pick again
        if nn == ww:
            #print("original word and synonym are identical -> pick again")
            del n[nn]
            if len(n) > 0:
                nn = max(n, key=lambda key: n[key])
    rsr = read_similarity(nn, r)
    ### if rsr <= 0.011:
    ###     return ww
    print("\tOriginal: " + ww + " -> Synonym: " + nn)
    print("\tSynonym Similarity Score:"),
    print(rsr)
    #print("\tAll synonyms with similarity score:")
    #pprint.pprint(n)
    return nn


#oDoc = (u'''Auf der Autobahn fahren viele Autos mit hoher Geschwindigkeit.''')
#oDoc = (u"Der Geschäftsführer ist im Meeting und fragt nach der Telefonnummer von seiner schönen Frau. Sie ist gerade mit ihren kleinen Kindern im Kindergarten und trifft danach ihre Freundinnen im Cafe.")
with codecs.open("/home/www/www.onetopp.com/100biere/input.txt", 'r', encoding='utf8') as f:
    oDoc = f.read()

lang = detect(oDoc)
print("Language detected: " + lang)
if lang == "en":
    de_nlp = spacy.load('en', tagger=True, parser=False, entity=False)
    from textblob import TextBlob
    from textblob import Word
    from pattern.en import singularize, pluralize  # pattern's English inflection lives in pattern.en
elif lang == "de":
    de_nlp = spacy.load('de', tagger=True, parser=False, entity=False)
    from textblob_de import TextBlobDE as TextBlob
    from textblob_de import Word
    from pattern.de import singularize, pluralize
else:
    de_nlp = spacy.load('de', tagger=True, parser=False, entity=False)
    from textblob_de import TextBlobDE as TextBlob
    from textblob_de import Word
    from pattern.de import singularize, pluralize

oDoc_nlp = de_nlp(oDoc, tag=True, parse=False)
nDoc = ""
Tokens = [o for o in oDoc_nlp]

# change at most this many nouns, adjectives and verbs
noun_max_change = 3
adj_max_change = 1
verb_max_change = 2
adj_count = 0
noun_count = 0
verb_count = 0

'''
# http://www.clips.ua.ac.be/pages/pattern-de
word = "Katze"
print singularize(word)
print pluralize(word)
sys.exit(0)
pprint.pprint(Tokens)
sys.exit(0)
'''

for o in Tokens:
    j = str(o)
    new_word = unicode(j, "utf-8")
    if o.pos_ == "NOUN":
        if noun_count == noun_max_change:
            nDoc += " " + new_word
        else:
            ew = exchange_word(new_word)
            nDoc += " " + ew
            noun_count += 1
    elif o.pos_ == "VERB":
        if verb_count == verb_max_change:
            nDoc += " " + new_word
        else:
            #print("Verb found for exchange\n")
            ew = exchange_word(new_word)
            nDoc += " " + ew
            verb_count += 1
    elif o.pos_ == "ADJ":
        if adj_count == adj_max_change:
            nDoc += " " + new_word
        else:
            ew = exchange_word(new_word)
            nDoc += " " + ew
            adj_count += 1
    else:
        nDoc += " " + new_word

print("Originalsatz: " + oDoc)
print("Ausgabesatz: " + nDoc)
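
# --- Optional sketch (not wired into the loop above) --------------------------
# singularize()/pluralize() are imported above but only exercised in the disabled
# test block. One way they could be used is to keep a noun synonym in the same
# grammatical number as the token it replaces; the helper below is purely
# illustrative and relies on a naive plural check of the original surface form.
def match_number(original, synonym):
    """Sketch: return `synonym` inflected like `original` (singular vs. plural)."""
    # assumption: if singularizing changes the original, it was a plural form
    if singularize(original) != original:
        return pluralize(singularize(synonym))
    return singularize(synonym)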
codecs.open("/home/www/www.onetopp.com/100biere/output.txt",'w',encoding='utf8') as f: f.write(nDoc) print("Script Runtime: --- %s seconds ---" % (time.time() - start_time)) sys.exit(0)