# -*- coding: utf-8 -*-
#!/usr/bin/python2.7 -S
#
# Trains one gensim Word2Vec model per row of the MySQL `synonym_unique`
# table and saves each model under /home/100biere/pretrained/<word>.w2v.bin.
# Progress (uid + word) is appended to trained.log so an interrupted run
# can be located and resumed manually.
#
# Setup notes kept from the original header:
#   python -m spacy.en.download ; python -m spacy.de.download
#   https://spacy.io/docs/#tutorials
#   pip install --upgrade thinc gensim mysql-python libleipzig langdetect

import time

start_time = time.time()

import codecs
import json
import locale
import logging
import os.path
import pprint
import shelve
import sys

import gensim
import spacy
import MySQLdb
from langdetect import detect
from libleipzig import *
from sphinxapi import *

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Python 2 hack: make implicit str<->unicode conversions default to UTF-8
# (needed because DB rows and file writes mix str and unicode below).
reload(sys)
sys.setdefaultencoding('utf-8')

SqlQuery = "SELECT word, synonyms, uid FROM synonym_unique WHERE 1 ORDER BY uid ASC"


def do_training(sentences, word):
    """Train a Word2Vec model on *sentences* and save it to disk.

    :param sentences: iterable of tokenized sentences (list of token lists),
        as expected by gensim.models.Word2Vec.
    :param word: head word; its lower-cased form names the model file.
    :returns: 1 on success.
    """
    path = "/home/100biere/pretrained" + "/" + word.lower() + ".w2v.bin"
    print("Working on: " + word)
    model = gensim.models.Word2Vec(sentences, hs=1, iter=400, size=400,
                                   workers=1, sorted_vocab=1, alpha=0.325,
                                   min_count=1)
    # replace=False keeps vectors trainable (more memory); replace=True
    # would freeze the model for read-only use.
    model.init_sims(replace=False)
    model.save(path)
    return 1


# https://www.tutorialspoint.com/python/python_database_access.htm
dbs = None
cursor = None
try:
    dbs = MySQLdb.connect(user="root", passwd="###########99",
                          host="127.0.0.1",
                          unix_socket="/var/run/mysqld/mysqld.sock",
                          port=3306, db="onetopp")
    cursor = dbs.cursor()
    cursor.execute(SqlQuery)
    dbs.commit()
    # Fetch all rows up front; each row is (word, synonyms, uid).
    results = cursor.fetchall()

    for row in results:
        word = row[0]
        synonyms = row[1]
        uid = row[2]
        # BUG FIX: train on the tokenized synonym list, not on the raw
        # semicolon-joined string (a bare string would be iterated by
        # gensim character-by-character). One sentence = all synonyms.
        w = synonyms.split(";")
        my_w = word.encode("latin-1")
        do_training([w], my_w)
        # Append-only progress log; close even if a write fails.
        log_file = codecs.open("/home/100biere/pretrained/trained.log",
                               "a", "utf-8")
        try:
            log_file.write('{:08d}\n'.format(uid))
            log_file.write(" -> word: " + my_w + "\n")
        finally:
            log_file.close()
except MySQLdb.Error as e:
    # _last_executed is a private MySQLdb attribute and may be missing,
    # so fall back to the static query text.
    if cursor is not None:
        print(getattr(cursor, "_last_executed", SqlQuery))
    print("Error: unable to fetch data: %s" % (e,))
finally:
    # Guarded cleanup: connect() itself may have failed, leaving
    # cursor/dbs unset.
    if cursor is not None:
        cursor.close()
    if dbs is not None:
        dbs.close()

print("Script Runtime: --- %s seconds ---" % (time.time() - start_time))
sys.exit(0)