#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Export rows of the ``openjurv4`` MySQL table as plain-text training files.

For every row returned by ``SqlQuery`` one file ``file-<n>`` is written to
``file_path``.  The file contains the cleaned ``p_leitsatz`` column, a newline,
the cleaned ``p_gruendePlain`` column and a final newline.  Cleaning replaces
``;`` with `` - ``, strips surrounding whitespace and transliterates German
umlauts and ``ß`` to ASCII.

Pipeline in which this script is the first step (commands kept verbatim):

    python3 DowserySummaryPreprepareADV.py

    python3 textsum_data_convert.py --command text_to_binary --in_directories train_data --out_files binary_data2/dowsery-train.bin,binary_data2/dowsery-validation.bin,binary_data2/dowsery-test.bin --split 1.0,0,0

    python3 textsum_data_convert.py --command text_to_vocabulary --in_directories train_data --out_files binary_data2/vocab.bin

    python3 pointer-generator/run_summarization.py --mode=train --data_path=/home/Framework/Prototyp/tensorflow-text-summary/binary_data2/dowsery-* --vocab_path=/home/Framework/Prototyp/tensorflow-text-summary/binary_data2/vocab.bin --log_root=/home/Framework/Prototyp/tensorflow-text-summary/binary_data2 --exp_name=myexperiment2

    -> Single Pass
    python3 pointer-generator/run_summarization.py --mode=decode --data_path=/home/Framework/Prototyp/tensorflow-text-summary/binary_data/dowsery-train.bin --vocab_path=/home/Framework/Prototyp/tensorflow-text-summary/binary_data/vocab.bin --log_root=/home/Framework/Prototyp/tensorflow-text-summary/binary_data --exp_name=myexperiment

See also: https://github.com/pltrdy/pointer-generator
"""

import argparse
import codecs
import getopt
import glob
import os
import re
import string
import sys
import time
import unicodedata
from datetime import datetime as dTime
from pprint import PrettyPrinter

import MySQLdb as mdb
import translitcodec
from colored import fg, bg, attr
from unidecode import unidecode

# Directory that receives one text file per exported row.
file_path = "/home/Framework/Prototyp/dowsery-demo/train_data/"
# Directory created up front for the binary data of the later pipeline steps;
# this script itself does not write into it.
file_path1 = "/home/Framework/Prototyp/dowsery-demo/binary_data/"

# Query against db "dowery_prototype".  For a small random sample, append
# "ORDER BY RAND() LIMIT 25" instead of "LIMIT 25000".
SqlQuery = "SELECT DISTINCT * FROM openjurv4 WHERE 1=1 AND p_hasleitsatz=1 LIMIT 25000;"

# Reference: https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-17.06.txt.gz

# ASCII transliteration of German umlauts and sharp s; built once, used per row.
_UMLAUT_TABLE = str.maketrans({
    "ä": "ae",
    "ö": "oe",
    "ü": "ue",
    "Ä": "Ae",
    "Ö": "Oe",
    "Ü": "Ue",
    "ß": "ss",
})


def encodeToUTF8Adv(text):
    """Return *text* without the characters that cannot be encoded as UTF-8.

    The string is encoded with the ``ignore`` handler (which drops e.g. lone
    surrogates) and decoded again.  The decode step uses ``ignore`` as well:
    the previously passed handler name ``"remove"`` is not registered and
    raises ``LookupError`` as soon as the interpreter looks it up.
    """
    return text.encode("utf-8", "ignore").decode("utf-8", "ignore")


def encodeToLatin1(text):
    """Encode *text* as UTF-8 and decode the resulting bytes as Latin-1.

    NOTE(review): despite the name this does not produce Latin-1 text; every
    non-ASCII character becomes the Latin-1 rendering of its UTF-8 bytes
    (e.g. ``"ä"`` -> ``"Ã¤"``).  Behaviour is kept unchanged because callers
    outside this file cannot be checked from here.
    """
    return text.encode("utf-8", "ignore").decode("latin-1", "ignore")


def all_same(items):
    """Return True when every element of the sequence *items* equals the first.

    An empty sequence yields True because there is nothing to compare.
    """
    return all(x == items[0] for x in items)


def _clean_field(value):
    """Normalise one database column value for the training file.

    NULL columns become an empty string instead of the literal text "None".
    """
    text = "" if value is None else str(value)
    return text.replace(";", " - ").strip().translate(_UMLAUT_TABLE)


def _export_rows(rows, target_dir):
    """Write one ``file-<n>`` per row into *target_dir*; return the file count."""
    written = 0
    for count, row in enumerate(rows):
        p_leitsatz = _clean_field(row["p_leitsatz"])
        p_gruendePlain = _clean_field(row["p_gruendePlain"])
        target = os.path.join(target_dir, "file-" + str(count))
        # The context manager closes the file even if the write fails.
        with open(target, "w", encoding="utf-8") as fo:
            fo.write(p_leitsatz + "\n" + p_gruendePlain + "\n")
        written += 1
    return written


def main():
    """Run the export: create directories, query MySQL, write files, exit."""
    os.makedirs(file_path, exist_ok=True)
    os.makedirs(file_path1, exist_ok=True)

    print()
    print(SqlQuery)
    print()

    # Open a database connection.
    # Be sure to change the host, username, password and database name to
    # match your own setup.
    # NOTE(review): credentials are hard-coded; consider reading them from the
    # environment or a MySQL option file.
    connection = mdb.connect(
        unix_socket="/var/run/mysqld/mysqld.sock",
        host="localhost",
        user="root",
        passwd="###########99",
        db="dowery_prototype",
    )
    try:
        # DictCursor returns each row as a mapping keyed by column name.
        cursor = connection.cursor(mdb.cursors.DictCursor)
        try:
            cursor.execute(SqlQuery)
            # Fetch the results of the search and export them.
            _export_rows(cursor.fetchall(), file_path)
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query or a file write fails.
        connection.close()

    print("####################################")
    print("###### BIN FERTIG ##################")
    print("####################################")
    sys.exit()


if __name__ == "__main__":
    main()