#!/usr/bin/python2.7 -S
# -*- coding: utf-8 -*-
# Model downloads:
#   python -m spacy.en.download
#   python -m spacy.de.download
# Docs: https://spacy.io/docs/#tutorials
# CSS: http://codepen.io/explosion/pen/xEpgKz
# CSS 2: https://explosion.ai/blog/displacy-ent-named-entity-visualizer
import time
start_time = time.time()

# Dependencies (Python 2.7):
#   pip install --upgrade thinc spacy gensim langdetect pattern mysql-python
#   pip install -U textblob textblob-de && python -m textblob.download_corpora
import spacy
import json
import codecs
import locale
import shelve
import gensim, logging
import pprint
import os.path
from sphinxapi import *
import MySQLdb

# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from langdetect import detect
from libleipzig import *  # https://github.com/palday/libleipzig-python

import sys
reload(sys)
sys.setdefaultencoding('utf-8')
#sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)

synonym_cache = '/home/100biere/cache/synonym_cache.shelve'


def get_synonyms(search):
    """Query the local Sphinx index for synonym candidates of `search` and
    resolve the matching rows from the MySQL table `synonym_unique`."""
    docids = []
    resList = []
    q = '@word ' + search
    #q = '@synonyms Haus'
    mode = SPH_MATCH_EXTENDED
    index = 'onetopp'
    sortby = '@relevance desc'
    groupby = ''
    groupsort = '@group desc'
    limit = 5  # must be at least 1
    #weight = {'words': [100], 'synonyms': [65]}

    cl = SphinxClient()
    #cl.SetServer('localhost', 9312)
    cl.SetServer("/var/run/searchd.sock", 0)
    cl.SetConnectTimeout(7.3)
    cl.SetMatchMode(mode)
    ###cl.SetFieldWeights(weight)  # does not work yet
    #cl.SetFilter('group_id', [])
    cl.SetGroupBy(groupby, SPH_GROUPBY_ATTR, groupsort)
    cl.SetSortMode(SPH_SORT_EXTENDED, sortby)
    cl.SetLimits(0, limit, max(limit, 10))
    cl.SetRankingMode(SPH_RANK_BM25)

    # run the query
    try:
        res = cl.Query(q, index)
    except Exception:
        print('query failed: %s' % cl.GetLastError())
        return []

    if not res:
        # Query() returns None on error instead of raising
        print('query failed: %s' % cl.GetLastError())
        return []

    if cl.GetLastWarning():
        print('WARNING: %s\n' % cl.GetLastWarning())

    # print('Query \'%s\' retrieved %d of %d matches in %s sec' % (q, res['total'], res['total_found'], res['time']))

    if 'matches' in res:
        for match in res['matches']:
            # print('doc_id=%s, weight=%d' % (match['id'], match['weight']))
            docids.append(match['id'])

    if len(docids) >= 1:
        # fetch the word/synonym rows for all matched document ids
        SqlQuery = "SELECT word, synonyms FROM synonym_unique WHERE `uid` IN (%s)" \
                   % ",".join(str(val) for val in docids)
        #pprint.pprint("SQL: " + SqlQuery)
        # https://www.tutorialspoint.com/python/python_database_access.htm
        try:
            dbs = MySQLdb.connect(user="root", passwd="###########99", host="127.0.0.1",
                                  unix_socket="/var/run/mysqld/mysqld.sock", port=3306, db="onetopp")
            cursor = dbs.cursor()
            cursor.execute(SqlQuery)
            dbs.commit()
            # fetch all rows as a list of tuples
            results = cursor.fetchall()
            cursor.close()
            dbs.close()
            for row in results:
                word = row[0]
                synonyms = row[1]
                # each row stores its synonyms as a ";"-separated list
                w = synonyms.split(";")
                resList.append(w)
                # print "HITS: word=%s, synonyms=%s" % (word, synonyms)
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            print "Error: unable to fetch data"

    # deduplicate while keeping the original order
    unique = reduce(lambda l, x: l.append(x) or l if x not in l else l, resList, [])
    return unique
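

# --- Illustrative sketch (not part of the original control flow): `synonym_cache`
# above is defined but never used. Assuming the intent was to memoize the
# Sphinx/MySQL lookups, a minimal wrapper using the already-imported `shelve`
# module could look like this; the helper name `get_synonyms_cached` is hypothetical.
def get_synonyms_cached(search):
    cache = shelve.open(synonym_cache)
    try:
        key = search.encode("utf-8")  # shelve keys must be byte strings under Python 2
        if key in cache:
            return cache[key]
        result = get_synonyms(search)
        cache[key] = result
        return result
    finally:
        cache.close()
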
def read_similarity(word, sentences):
    """Return the similarity score of the closest word2vec neighbour of `word`,
    training (and caching) a per-word model on `sentences` if none exists yet."""
    wv = 0.0
    if word and len(word) >= 2 and len(sentences) >= 2:
        path = "/home/100biere/pretrained/" + word + ".w2v.bin"
        # Possible later improvement: use "similar_by_word(word, topn=10, restrict_vocab=None)"
        # from https://radimrehurek.com/gensim/models/word2vec.html, which returns exactly the
        # matching word. Alternatively, train word2vec once on the whole synonym SQL database
        # and only load that model when needed.
        if os.path.isfile(path):
            print("Trained gensim model for word found: " + word)
            model = gensim.models.Word2Vec.load(path)
        else:
            # no cached model yet -> train one for this word
            print("Doing gensim training -> no model for word found: " + word)
            model = gensim.models.Word2Vec(sentences, sg=1, hs=1, iter=50, size=400, workers=1,
                                           sorted_vocab=1, alpha=0.325, min_count=1)
            model.init_sims(replace=False)  # keeps the model trainable -> more memory
            #model.init_sims(replace=True)  # read-only, no further training -> less memory
            model.save(path)
        try:
            sim = model.most_similar(positive=[word], negative=[], topn=1)
            #pprint.pprint(sim)
            return sim[0][1]
        except KeyError:
            pass
    return wv


def get_key(key):
    try:
        return int(key)
    except ValueError:
        return key


def exchange_word(o):
    """Replace the token `o` with its best-scoring synonym, or return it unchanged."""
    j = str(o)
    if len(j) <= 1:  # skip words shorter than two characters
        return o
    ww = unicode(j, "utf-8")
    r = get_synonyms(ww)
    if not r:
        return j

    # score every synonym candidate against the candidate list
    n = {}
    for mm in r:
        m = mm[0]
        if len(m) >= 2:  # only consider synonyms of at least two characters
            rs = read_similarity(m, r)
            n[m] = rs

    ###print("Synonyms with similarity score:")
    ###pprint.pprint(n)
    if len(n) < 1:
        # no usable synonym candidates -> keep the original word
        return ww

    # take the candidate with the highest similarity score
    best = max(n, key=lambda key: n[key])
    nn = best.decode("utf-8", "ignore")
    # if the original word and the synonym are identical, drop it and pick again
    if nn == ww:
        print("Original word and synonym are identical -> picking the next best")
        del n[best]
        if len(n) > 0:
            best = max(n, key=lambda key: n[key])
            nn = best.decode("utf-8", "ignore")
    rsr = read_similarity(nn, r)
    ### if rsr <= 0.011:
    ###     return ww
    print("\tOriginal: " + ww + " -> Synonym: " + nn)
    print("\tSynonym similarity score:"),
    print(rsr)
    return nn
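

# --- Illustrative usage note (assumed example values, not executed here):
# exchange_word(u"Haus") asks Sphinx/MySQL for synonym candidates of "Haus",
# scores each candidate with read_similarity() against the candidate list,
# and returns the highest-scoring one; if no candidate is found, the original
# word is returned unchanged.
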
#oDoc = (u'''Auf der Autobahn fahren viele Autos mit hoher Geschwindigkeit.''')
#oDoc = (u"Der Geschäftsführer ist im Meeting und fragt nach der Telefonnummer von seiner schönen Frau. Sie ist gerade mit ihren kleinen Kindern im Kindergarten und trifft danach ihre Freundinnen im Cafe.")
with codecs.open("/home/100biere/input.txt", 'r', encoding='utf8') as f:
    oDoc = f.read()

lang = detect(oDoc)
print("Language detected: " + lang)
print("Text: " + oDoc)

# Load the language-specific pipeline and helpers; German is the default.
if lang == "en":
    de_nlp = spacy.load('en', tagger=True, parser=False, entity=False)
    from textblob import TextBlob
    from textblob import Word
    from pattern.en import singularize, pluralize
else:
    de_nlp = spacy.load('de', tagger=True, parser=False, entity=False)
    from textblob_de import TextBlobDE as TextBlob
    from textblob_de import Word
    from pattern.de import singularize, pluralize

print("Tagging & tokenizing text")
oDoc_nlp = de_nlp(oDoc, tag=True, parse=False)
nDoc = ""
Tokens = [o for o in oDoc_nlp]

# How many tokens of each part of speech may be exchanged at most.
noun_max_change = 4
adj_max_change = 1
verb_max_change = 1
adj_count = 0
noun_count = 0
verb_count = 0

'''
# http://www.clips.ua.ac.be/pages/pattern-de
word = "Katze"
print singularize(word)
print pluralize(word)
sys.exit(0)
pprint.pprint(Tokens)
sys.exit(0)
'''

# Walk over the tagged tokens and exchange nouns, verbs and adjectives
# until the per-POS limits above are reached.
for o in Tokens:
    j = str(o)
    new_word = unicode(j, "utf-8")
    if o.pos_ == "NOUN":
        if noun_count == noun_max_change:
            nDoc += " " + new_word
        else:
            ew = exchange_word(new_word)
            nDoc += " " + ew
            noun_count += 1
    elif o.pos_ == "VERB":
        if verb_count == verb_max_change:
            nDoc += " " + new_word
        else:
            ew = exchange_word(new_word)
            nDoc += " " + ew
            verb_count += 1
    elif o.pos_ == "ADJ":
        if adj_count == adj_max_change:
            nDoc += " " + new_word
        else:
            ew = exchange_word(new_word)
            nDoc += " " + ew
            adj_count += 1
    else:
        nDoc += " " + new_word

print("Original text: " + oDoc)
print("Rewritten text: " + nDoc)

# write the rewritten Unicode text
with codecs.open("/home/100biere/output.txt", 'w', encoding='utf8') as f:
    f.write(nDoc)

print("Script Runtime: --- %s seconds ---" % (time.time() - start_time))
sys.exit(0)