#!/usr/bin/python
# -*- coding: UTF-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
###########################
### Author:    Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence:   Commercial / OneTipp
### Version:   1.0.0c - 4-11-2015@23:53 Uhr
### Contact:   sebastian.enger@gmail.com
### OneTipp Text Tool in Python: Main File
###########################

"""
TODO:
- Parallelize synonym/word replacement for faster processing and a better
  response time of the tool (see the multiprocessing sketch before the
  main loop below).
- Develop an antonym database with the help of crawled websites.
- Build a database of simple German.
- Compute the readability score of an input text, based on simple texts,
  and use the "simple German" database for replacements.
- Rewrite scientific texts using the Leipzig corpora and our local
  synonym database.

Tests on 2015-10-29: https://github.com/rsennrich/clevertagger
"""

# https://docs.python.org/2/library/configparser.html
import os
import sys

reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append('/home/onetipp/python/modules')
os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'

import random
import codecs
import re

import mod
import stopwords
import pprint

import pattern.de
from pattern.de import conjugate
from pattern.de import INFINITIVE, PRESENT, PAST, SG, SUBJUNCTIVE
from pattern.de import article, DEFINITE, INDEFINITE, OBJECT, gender, MALE, FEMALE, NEUTRAL
from textblob_de import TextBlobDE as TextBlob
from textblob_de import PatternTagger
import treetaggerwrapper

# https://perso.limsi.fr/pointal/doku.php?id=dev:treetaggerwrapper
# https://subversion.renater.fr/ttpw/trunk/treetaggerwrapper.py
# http://treetaggerwrapper.readthedocs.org/en/latest/#polls-of-taggers-process

# cursorMysql = mod.mysql.cursor()
re_match = r"(\?|\.|\!)"  # matches sentence-final punctuation: ". WORT"

# sent_tokenize_list = sent_tokenize(text)
# Summarize the text first and then work on it:
# tSumy = mod.summarizeText(text)
# tokens = mod.nltk.word_tokenize(tSumy)
# tokens = mod.nltk.sent_tokenize(tSumy, language='german')
# tokensRaw = mod.nltk.word_tokenize(text)
# cursorMysql.execute("SELECT p_articletext FROM (publish_de) ORDER BY RAND() LIMIT 1;")
# cursorMysql.execute("SELECT p_articletext FROM (publish_de) WHERE BINARY `id` = '%s' LIMIT 1;" % (word))
# result = cursorMysql.fetchall()

# read the input file given on the command line into a string
inputfile = sys.argv[1]
# text = codecs.open(inputfile, "r", encoding='utf-8').read()
text = codecs.open(inputfile, "r").read()

tagger = treetaggerwrapper.TreeTagger(TAGLANG='de', TAGDIR='/home/onetipp/software/treetagger/')
GermanStopwords = stopwords.getGermanStopwords()
GermanSTTLIgnoreTags = stopwords.getSttsIgnoreTags()

tokens = mod.nltk.sent_tokenize(text, language='german')

# STTS tag lists, see http://www.clips.ua.ac.be/pages/pattern-de
# finite verb tags that get re-conjugated below
list_conjugate = [
    "VAFIN",
    "VVFIN",
]
# all verb tags (full, auxiliary and modal verbs)
list_POS_NENN_VV_LOGIC = [
    "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP",
    "VAFIN", "VAIMP", "VAINF", "VAPP",
    "VMFIN", "VMINF", "VMPP",
]
# proper nouns
list_POS_NENN_LOGIC = [
    "NE",
]
# definite articles, used by the disabled article-change block below
list_POS_ARTIKELCHANGE_LOGIC = [
    "der",
    "die",
    "das",
]

ListFinal = []
appendCurrentWord = 0
noAppendNextFlag = 0
lastWord = ""
currentWord = ""
noDoubleHash = set()
UseSyn = None
firstBestSynHit = None
firstBestSynHitRank = None
synDictLeipzig = {}
ignoreNextWord = 0
WordCounter = []
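# ---------------------------------------------------------------------------
# Sketch for the "parallelize synonym/word replacement" TODO above; this is
# illustrative only and not wired into the pipeline. `rewrite_sentence` is a
# hypothetical stand-in for the per-sentence body of the main loop below, and
# since a TreeTagger process cannot easily be shared across workers, each
# worker would likely need to create its own tagger instance.
from multiprocessing import Pool


def rewrite_sentence(sentence):
    # hypothetical placeholder: tag `sentence`, swap in synonyms and
    # re-conjugated verbs, and return the rewritten sentence
    return sentence


def rewrite_parallel(sentences, workers=4):
    pool = Pool(processes=workers)
    try:
        # results come back in the same order as the input sentences
        return pool.map(rewrite_sentence, sentences)
    finally:
        pool.close()
        pool.join()
# ---------------------------------------------------------------------------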
for i, s in enumerate(tokens):
    # print(s, i)
    if s is not None:
        # print("Satz: ", s)
        unicode_text = mod.safe_unicode(s)
        # tSumy = mod.summarizeText(s)
        tags = tagger.tag_text(unicode_text)
        tags2 = treetaggerwrapper.make_tags(tags)
        for j, ele in enumerate(tags2):
            if ele:
                word_tmp = ele[0]
                unicode_text = mod.safe_unicode(ele[0])
                word = unicode_text.encode('utf-8')
                WordCounter.append(1)
                # todo: if POS-TAG == NE and NEXT-POS-TAG == VV/VVFIN etc.,
                # then the verb stays as it is
                # pos_tag = ele[1].encode("ascii")
                pos_tag_tmp = ele[1]
                unicode_text = mod.safe_unicode(ele[1])
                pos_tag = unicode_text.encode('utf-8')

                # look one token ahead, if there is one
                up = j + 1
                if up < len(tags2):
                    nextelem = tags2[up]
                else:
                    nextelem = None

                if nextelem:
                    unicode_text2 = mod.safe_unicode(nextelem[0])
                    wordNext = unicode_text2.encode('utf-8')
                    unicode_text3 = mod.safe_unicode(nextelem[1])
                    pos_tagNext = unicode_text3.encode('utf-8')
                    # print("Current word:", word)
                    # print("\tCurrent POS:", pos_tag)
                    # print("Next word:", wordNext)

                    # article followed by a common noun, and at least five words
                    # since the last replacement: try a synonym for the noun
                    if (pos_tag == "ART" and pos_tagNext == "NN") and len(WordCounter) >= 5:
                        sLeipzigList = mod.getSynLeipzig(wordNext)
                        synDictLeipzig = {}
                        if sLeipzigList:
                            for wSynL in sLeipzigList:
                                # synDict[SynRanker(wSyn, word)] = wSyn
                                if wSynL not in noDoubleHash:
                                    synDictLeipzig[wSynL] = mod.SynRanker(wSynL, word)

                            # rank the candidates, best score first
                            sortedSynList = sorted(synDictLeipzig.items(), key=lambda x: x[1], reverse=True)
                            if len(sortedSynList) >= 3:
                                # pick one of the (up to four) top-ranked
                                # candidates at random; cap the index so it
                                # cannot overrun the list
                                randElement = random.randint(0, min(3, len(sortedSynList) - 1))
                                firstBestSynHit = mod.safe_unicode(sortedSynList[randElement][0])
                                firstBestSynHit = firstBestSynHit.encode('utf-8')
                                # keep the rank numeric so the < 0 checks below work
                                firstBestSynHitRank = sortedSynList[randElement][1]
                            else:
                                # later: query our local synonym DB here instead
                                # pprint.pprint(sortedSynList)
                                firstBestSynHit = wordNext
                                firstBestSynHitRank = 1
                                ListFinal.append(word)
                                ListFinal.append("")
                                # ListFinal.append(firstBestSynHit)
                                ListFinal.insert(len(ListFinal), wordNext)
                                ListFinal.append("")
                                ignoreNextWord = 1
                                continue

                            # print("Wort: ", word)
                            # print("SynBestHit: ", firstBestSynHit)
                            # print(firstBestSynHitRank)
                            # print(type(firstBestSynHitRank))
                            synDictLeipzig = {}
                            tmpRank = 0
                            UseSyn = True
                            # if the chosen synonym has a negative SynRank, fall
                            # back to the top-ranked candidate
                            if firstBestSynHitRank < 0:
                                firstBestSynHit = mod.safe_unicode(sortedSynList[0][0])
                                firstBestSynHit = firstBestSynHit.encode('utf-8')
                                tmpRank = sortedSynList[0][1]
                                if tmpRank < 0:
                                    UseSyn = False
                            if UseSyn:
                                ListFinal.append(word)
                                ListFinal.append("")
                                # ListFinal.append(firstBestSynHit)
                                ListFinal.insert(len(ListFinal), firstBestSynHit)
                                ListFinal.append("")
                                ignoreNextWord = 1
                                WordCounter = []
                                continue
                            noDoubleHash.add(firstBestSynHit)
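                    # Note: pattern.de also ships gender() and article(), which
                    # guess a noun's grammatical gender and derive a matching
                    # article; the disabled block below uses them as
                    # article(wordNext, gender=gender(wordNext)). Lexicon
                    # coverage of a given noun is not guaranteed, hence the
                    # try/except around it.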
") # word = currentWord # appendCurrentWord = 1 # ListFinal.append("") # ListFinal.append(currentWord) # ListFinal.append("") # continue # except: # 1 # # if POS is ART then wordNext -> artice(funktion) und ersetzte ART durch articel() # wenn aktueller Eintrag ein NE - Eigenname - ist if pos_tag in list_POS_NENN_LOGIC: if pos_tagNext in list_POS_NENN_VV_LOGIC: noAppendNextFlag = 1 lastWord = wordNext ListFinal.append(word) # print("Current word:", word) # print("\tCurrent POS:", pos_tag) # print("Ignore Next word:", wordNext) # print("\tNext POS:", pos_tagNext) continue if ignoreNextWord == 1: ignoreNextWord = 0 continue if pos_tag not in GermanStopwords and pos_tag not in GermanStopwords: # print("Pos tag to possible Change:",pos_tag) # print("Word:", word) if pos_tag in list_conjugate: conj_tmp = conjugate(word, PAST, 1, SG, mood=SUBJUNCTIVE) unicode_text = mod.safe_unicode(conj_tmp) conj = unicode_text.encode('utf-8') if noAppendNextFlag == 1: ListFinal.append(word) noAppendNextFlag = 0 else: ListFinal.append("") ListFinal.append(conj) ListFinal.append("") continue # print("Word Past: ", conj, " - Lenght: " ,len(ListFinal) ,"
") else: 1 ListFinal.append(word) continue else: 1 ListFinal.append(word) # https://pypi.python.org/pypi/languagedet # file schreiben # readabilityVar = str(mod.textstat.flesch_reading_ease(text)) writeThis = " ".join(ListFinal) writeThis.encode('utf-8') with codecs.open("/tmp/onetipp_tmp.txt", 'wb+', encoding='utf-8') as f: f.write(writeThis) f.close() # mod.mysql.commit() # mod.mysql.close() # # mod.sphinx.commit() # mod.sphinx.close() exit(0) """ The Flesch Reading Ease formula function name - flesch_reading_ease(text) returns the Flesch Reading Ease Score. Following table is helpful to access the ease of readability in a document. 90-100 : Very Easy 80-89 : Easy 70-79 : Fairly Easy 60-69 : Standard 50-59 : Fairly Difficult 30-49 : Difficult 0-29 : Very Confusing """