#!/usr/bin/python # -*- coding: ISO-8859-1 -*- from __future__ import division from __future__ import absolute_import from __future__ import division, print_function, unicode_literals ########################### ### Autor: Sebastian Enger / M.Sc. ### Copyright: Sebastian Enger ### Licence: Commercial / OneTipp ### Version: 1.0.1a - 26-10-2015@23:53 Uhr ### Contact: sebastian.enger@gmail.com ### OneTipp Text Tool in Python: Main File ########################### """ Synonym bzw Wortersetzung parallelisieren für schnellere Verarbeitung und Reaktionszeit des Tools Antonym Datenbank Entwicklung mit Hilfe gecrawlter Websites Aufbau einer Datenbank mit einfacher deutscher Sprache Berechnung des lesbarkeitswerts eines eingabetextes - basierend auf einfachen Texten die "simple German " Datenbank für Austausch nutzen, Wissenschaftliche Texte mit Leipzig und unserer lokalen Synonym Datenbank austauschen """ #https://docs.python.org/2/library/configparser.html import os import sys reload(sys) sys.path.append('/home/onetipp/python/modules') os.environ['PYTHON_EGG_CACHE'] = '/home/compress/' import random import codecs import re import mod from textblob_de import TextBlobDE as TextBlob from textblob_de import PatternTagger from textblob_de import TextBlobDE cursorMysql = mod.mysql.cursor() noDoubleHash = set() re_match = r"(\?|\.|\!)" # Match: ". WORT" # lies die Ein und Ausgabedateien inputfile = sys.argv[1] outputfile = sys.argv[2] # read file into string text = codecs.open(inputfile, "r", encoding='utf-8').read() # sent_tokenize_list = sent_tokenize(text) # Summarize the text first and then work on it tSumy = mod.summarizeText(text) #tokens = mod.nltk.word_tokenize(tSumy) tokens = mod.nltk.sent_tokenize(tSumy) tokensRaw = mod.nltk.word_tokenize(text) count = -1 changeEveryWord = 6 #Leistungsschutzrecht: 7 Zeichen dürfen genutzt werden, darüber muss geändert werden changeEveryWordFlag = 0 changeEveryWordTemp = 0 #temporary upcount ignoreNextWord = 0 for word in tokens: count += 1 word = mod.to_unicode(word) blob = TextBlobDE(word) for s, elem in enumerate(blob.tags): myWord = elem[0] myCatg = elem[1] if len(myWord) <= 3: continue sLeipzigList = mod.getSynLeipzig(myWord) lstcWord = word[0:1] synDictLeipzig = {} # print(myWord, " ",myCatg, " SYN>") if sLeipzigList: for wSynL in sLeipzigList: #synDict[SynRanker(wSyn, word)] = wSyn if wSynL not in noDoubleHash: synDictLeipzig[wSynL] = mod.SynRanker(wSynL, word) print("Wort: ",myWord, " Synon ",wSynL, " -> SynRank: ", synDictLeipzig[wSynL]) sortedSynList = [] sortedSynList = sorted(synDictLeipzig.items(), key=lambda x: x[1], reverse=True) randElement = random.randint(0, len(sortedSynList)-1) firstBestSynHit = mod.to_unicode(sortedSynList[randElement][0]) firstBestSynHitRank = mod.to_unicode(sortedSynList[randElement][1]) 1 1 # file schreiben outputtext = ' '.join(tokens) outputtextRaw = ' '.join(tokensRaw) readabilityVar = str(mod.textstat.flesch_reading_ease(text)) with codecs.open(outputfile, 'w', encoding='utf-8') as f: f.write(outputtext) f.close() with codecs.open(outputfile+".raw.txt", 'w', encoding='utf-8') as f: f.write(outputtextRaw) f.close() mod.mysql.commit() mod.mysql.close() mod.sphinx.commit() mod.sphinx.close() exit(0) """ The Flesch Reading Ease formula function name - flesch_reading_ease(text) returns the Flesch Reading Ease Score. Following table is helpful to access the ease of readability in a document. 90-100 : Very Easy 80-89 : Easy 70-79 : Fairly Easy 60-69 : Standard 50-59 : Fairly Difficult 30-49 : Difficult 0-29 : Very Confusing """