#!/usr/bin/python
# -*- coding: UTF-8 -*-
from __future__ import division
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
###########################
### Autor: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.0c - 2-11-2015@23:53 Uhr
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python: Main File
###########################
"""
Synonym bzw Wortersetzung parallelisieren für schnellere Verarbeitung und Reaktionszeit des Tools
Antonym Datenbank Entwicklung mit Hilfe gecrawlter Websites
Aufbau einer Datenbank mit einfacher deutscher Sprache
Berechnung des lesbarkeitswerts eines eingabetextes - basierend auf einfachen Texten die "simple German " Datenbank für Austausch nutzen, Wissenschaftliche Texte mit Leipzig und unserer lokalen Synonym Datenbank austauschen
Tests am 29.10.2015:
https://github.com/rsennrich/clevertagger
"""
# https://docs.python.org/2/library/configparser.html
import os
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append('/home/onetipp/python/modules')
os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'
import random
import codecs
import re
import mod
import stopwords
import pprint
import pattern.de
from pattern.de import conjugate
from pattern.de import INFINITIVE, PRESENT, PAST, SG, SUBJUNCTIVE
from textblob_de import TextBlobDE as TextBlob
from textblob_de import PatternTagger
from textblob_de import TextBlobDE
import treetaggerwrapper
from pattern.de import article, DEFINITE, INDEFINITE, FEMALE, OBJECT, gender, MALE, FEMALE, NEUTRAL
# cursorMysql = mod.mysql.cursor()
# Set presumably intended to prevent replacing the same word twice —
# NOTE(review): never read or written in this section of the file; confirm use elsewhere.
noDoubleHash = set()
# Regex for sentence-final punctuation (?, ., !) — unused in this section.
re_match = r"(\?|\.|\!)" # Match: ". WORT"
# # sent_tokenize_list = sent_tokenize(text)
# # Summarize the text first and then work on it
# tSumy = mod.summarizeText(text)
# #tokens = mod.nltk.word_tokenize(tSumy)
# tokens = mod.nltk.sent_tokenize(tSumy, language='german')
# tokensRaw = mod.nltk.word_tokenize(text)
# cursorMysql.execute("SELECT p_articletext FROM (publish_de) ORDER BY RAND() LIMIT 1;")
# cursorMysql.execute("SELECT p_articletext FROM (publish_de) WHERE BINARY `id` = '%s' LIMIT 1;" % (word))
import re
# https://perso.limsi.fr/pointal/doku.php?id=dev:treetaggerwrapper
# https://subversion.renater.fr/ttpw/trunk/treetaggerwrapper.py
# http://treetaggerwrapper.readthedocs.org/en/latest/#polls-of-taggers-process
# result = cursorMysql.fetchall()
# lies die Ein und Ausgabedateien
# Input file path is the first command-line argument.
inputfile = sys.argv[1]
# Read the whole input into a string.
# FIX: the original `codecs.open(inputfile, "r").read()` never closed the
# file handle; a context manager releases it deterministically.
with codecs.open(inputfile, "r") as input_fh:
    text = input_fh.read()
# TreeTagger instance configured for German.
tagger = treetaggerwrapper.TreeTagger(TAGLANG='de', TAGDIR='/home/onetipp/software/treetagger/')
# Word/tag filter lists supplied by the project-local stopwords module.
GermanStopwords = stopwords.getGermanStopwords()
GermanSTTLIgnoreTags = stopwords.getSttsIgnoreTags()
# Split the raw text into sentences for per-sentence tagging.
tokens = mod.nltk.sent_tokenize(text, language='german')
# http://www.clips.ua.ac.be/pages/pattern-de
# Finite-verb POS tags whose words are re-conjugated by the main loop.
list_conjugate = ["VAFIN", "VVFIN"]

# STTS verb tags: a token carrying one of these tags directly after a
# proper name (NE) is appended unchanged by the main loop.
list_POS_NENN_VV_LOGIC = [
    "VVFIN", "VVIMP", "VVINF", "VVIZU", "VVPP",
    "VAFIN", "VAIMP", "VAINF", "VAPP",
    "VMFIN", "VMINF", "VMPP",
]

# STTS tags treated as proper names.
list_POS_NENN_LOGIC = ["NE"]

# Lower-cased definite articles eligible for gender-based regeneration.
list_POS_ARTIKELCHANGE_LOGIC = ["der", "die", "das"]

# Mutable state shared by the main processing loop below.
ListFinal = []            # collected output tokens
appendCurrentWord = 0     # flag: current word was replaced by a regenerated article
noAppendNextFlag = 0      # flag: the upcoming verb must be appended unchanged
lastWord = ""             # look-ahead word remembered after a proper name
currentWord = ""          # most recently generated replacement word
currentWord = ""
# Main pass: POS-tag every sentence with TreeTagger, then walk the token
# stream and apply three rewrite rules:
#   1. A definite article (ART: der/die/das) is regenerated from the
#      gender of the following word via pattern.de article().
#   2. A verb directly following a proper name (NE) is kept unchanged.
#   3. Finite verbs (VAFIN/VVFIN) are re-conjugated into past subjunctive.
# All surviving tokens are collected in ListFinal.
for i, s in enumerate(tokens):
    if s is None:
        continue
    unicode_text = mod.safe_unicode(s)
    tags = tagger.tag_text(unicode_text)
    tags2 = treetaggerwrapper.make_tags(tags)
    for j, ele in enumerate(tags2):
        if not ele:
            continue
        word = mod.safe_unicode(ele[0]).encode('utf-8')
        pos_tag = mod.safe_unicode(ele[1]).encode('utf-8')
        # Look one token ahead; None at the end of the sentence.
        # BUG FIX: the original wrote "nextelem is None" — a no-op identity
        # comparison instead of an assignment — so nextelem was undefined on
        # the first iteration and stale at every sentence end.
        up = j + 1
        nextelem = tags2[up] if up < len(tags2) else None
        wordNext = None
        pos_tagNext = None
        if nextelem:
            wordNext = mod.safe_unicode(nextelem[0]).encode('utf-8')
            pos_tagNext = mod.safe_unicode(nextelem[1]).encode('utf-8')
        # Rule 1: regenerate the article from the next word's gender.
        if pos_tag == "ART" and word.lower() in list_POS_ARTIKELCHANGE_LOGIC and wordNext is not None:
            try:
                myGender = gender(wordNext)
                currentWord = article(wordNext, gender=myGender)
                word = currentWord
                appendCurrentWord = 1
                ListFinal.append("")
                ListFinal.append(currentWord)
                ListFinal.append("")
                continue
            except Exception:
                # Best effort: if pattern.de cannot determine a gender or
                # article, fall through and keep the original token.
                pass
        # Rule 2: a proper name followed by a verb — remember to keep the
        # verb unchanged when it is processed on the next iteration.
        if pos_tag in list_POS_NENN_LOGIC:
            if pos_tagNext in list_POS_NENN_VV_LOGIC:
                noAppendNextFlag = 1
                lastWord = wordNext
                ListFinal.append(word)
                continue
        # Rule 3: conjugate finite verbs, skipping stopwords / ignorable tags.
        # BUG FIX: the original tested "pos_tag not in GermanStopwords" twice,
        # which was always true (POS tags are never stopwords). Check the word
        # against the stopword list and the tag against the STTS ignore list
        # (which was loaded above but never used).
        if word not in GermanStopwords and pos_tag not in GermanSTTLIgnoreTags:
            if pos_tag in list_conjugate:
                conj = mod.safe_unicode(conjugate(word, PAST, 1, SG, mood=SUBJUNCTIVE)).encode('utf-8')
                if noAppendNextFlag == 1:
                    # Verb right after a proper name: keep it unchanged.
                    ListFinal.append(word)
                    noAppendNextFlag = 0
                else:
                    ListFinal.append("")
                    ListFinal.append(conj)
                    ListFinal.append("")
                continue
            ListFinal.append(word)
            continue
        ListFinal.append(word)
# https://pypi.python.org/pypi/languagedet
# file schreiben
# readabilityVar = str(mod.textstat.flesch_reading_ease(text))
# Join the processed tokens and hand the result to the next pipeline stage.
writeThis = " ".join(ListFinal)
# FIX: the original called writeThis.encode('utf-8') and discarded the
# result (strings are immutable, so the bare call was a no-op) and also
# called f.close() inside the `with` block (redundant — the context
# manager already closes the file).
with codecs.open("/tmp/onetipp_tmp.txt", 'wb+', encoding='utf-8') as f:
    f.write(writeThis)
# mod.mysql.commit()
# mod.mysql.close()
#
# mod.sphinx.commit()
# mod.sphinx.close()
exit(0)
"""
The Flesch Reading Ease formula
function name - flesch_reading_ease(text)
returns the Flesch Reading Ease Score. The following table is helpful to assess the ease of readability of a document.
90-100 : Very Easy
80-89 : Easy
70-79 : Fairly Easy
60-69 : Standard
50-59 : Fairly Difficult
30-49 : Difficult
0-29 : Very Confusing
"""