#!/usr/bin/python
# -*- coding: UTF-8 -*-
from __future__ import division
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
###########################
### Autor: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.0c - 2-11-2015@23:53 Uhr
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python: Main File
###########################
"""
Synonym bzw Wortersetzung parallelisieren für schnellere Verarbeitung und Reaktionszeit des Tools
Antonym Datenbank Entwicklung mit Hilfe gecrawlter Websites
Aufbau einer Datenbank mit einfacher deutscher Sprache
Berechnung des lesbarkeitswerts eines eingabetextes - basierend auf einfachen Texten die "simple German " Datenbank für Austausch nutzen, Wissenschaftliche Texte mit Leipzig und unserer lokalen Synonym Datenbank austauschen
Tests am 29.10.2015:
https://github.com/rsennrich/clevertagger
"""
#https://docs.python.org/2/library/configparser.html
import os
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
sys.path.append('/home/onetipp/python/modules')
os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'
import random
import codecs
import re
import mod
import stopwords
import pprint
import pattern.de
from pattern.de import conjugate
from pattern.de import INFINITIVE, PRESENT, PAST, SG, SUBJUNCTIVE
from textblob_de import TextBlobDE as TextBlob
from textblob_de import PatternTagger
from textblob_de import TextBlobDE
import treetaggerwrapper
#cursorMysql = mod.mysql.cursor()
# Set intended to prevent processing the same word twice.
# NOTE(review): not referenced anywhere in this file — possibly used by
# another part of the project or dead code; verify before removing.
noDoubleHash = set()
# Sentence-terminator regex: matches '?', '.' or '!' (e.g. ". WORD").
# NOTE(review): also unused in this file.
re_match = r"(\?|\.|\!)" # Match: ". WORT"
# # sent_tokenize_list = sent_tokenize(text)
# # Summarize the text first and then work on it
# tSumy = mod.summarizeText(text)
# #tokens = mod.nltk.word_tokenize(tSumy)
# tokens = mod.nltk.sent_tokenize(tSumy, language='german')
# tokensRaw = mod.nltk.word_tokenize(text)
#cursorMysql.execute("SELECT p_articletext FROM (publish_de) ORDER BY RAND() LIMIT 1;")
#cursorMysql.execute("SELECT p_articletext FROM (publish_de) WHERE BINARY `id` = '%s' LIMIT 1;" % (word))
import re
# https://perso.limsi.fr/pointal/doku.php?id=dev:treetaggerwrapper
# https://subversion.renater.fr/ttpw/trunk/treetaggerwrapper.py
# http://treetaggerwrapper.readthedocs.org/en/latest/#polls-of-taggers-process
# result = cursorMysql.fetchall()
# lies die Ein und Ausgabedateien
# --- Input handling and NLP pipeline setup ---
# Path of the text file to rewrite, taken from the command line.
inputfile = sys.argv[1]

# Read the whole file into a string. Bugfix: use a context manager so the
# file handle is closed deterministically (the original leaked the handle).
# NOTE(review): the file is read without an explicit encoding; the
# commented-out utf-8 variant would return unicode instead — confirm which
# the downstream tokenizer expects.
# text = codecs.open(inputfile, "r", encoding='utf-8').read()
with codecs.open(inputfile, "r") as _infile:
    text = _infile.read()

# TreeTagger instance for German POS tagging.
tagger = treetaggerwrapper.TreeTagger(TAGLANG='de', TAGDIR='/home/onetipp/software/treetagger/')

# German stopwords and STTS POS tags that should be skipped by the rewriter.
GermanStopwords = stopwords.getGermanStopwords()
GermanSTTLIgnoreTags = stopwords.getSttsIgnoreTags()

# Split the input text into sentences.
tokens = mod.nltk.sent_tokenize(text, language='german')

# http://www.clips.ua.ac.be/pages/pattern-de
# STTS tags of finite verbs (auxiliary / full) that get re-conjugated below.
list_conjugate = [
    "VAFIN",
    "VVFIN",
]

# Accumulates the output tokens of the rewritten text.
ListFinal = []
# Main rewriting loop: POS-tag every sentence; finite verbs (VAFIN/VVFIN)
# are re-conjugated into past subjunctive (Konjunktiv II), all other words
# are appended unchanged.
for s in tokens:
    if s is not None:
        # print("Sentence: ", s)
        unicode_text = mod.safe_unicode(s)
        tags = tagger.tag_text(unicode_text)
        tags2 = treetaggerwrapper.make_tags(tags)
        # pprint.pprint(tags2)
        for ele in tags2:
            if ele:
                # ele is a TreeTagger tag tuple; ele[0] is the token,
                # ele[1] its STTS POS tag. Normalise both to utf-8 bytes.
                word = mod.safe_unicode(ele[0]).encode('utf-8')
                # Example: "Sie zeigt auf der Karte wo die Stadt Moskau ist."
                # becomes: "Sie zeigt ... wo die Stadt Moskau waere."
                # TODO: if POS tag == NE and the NEXT POS tag is VV/VVFIN etc.,
                # leave the verb unchanged.
                pos_tag = mod.safe_unicode(ele[1]).encode('utf-8')
                # print("\n) Wort:", word, " > Pos:", pos_tag, "\n")
                # Bugfix: the original tested "pos_tag not in GermanStopwords"
                # twice (copy-paste error) and never used GermanSTTLIgnoreTags,
                # which is loaded above and otherwise unused. Intended filter:
                # skip stopwords (by word) and ignorable POS tags (by tag).
                if word not in GermanStopwords and pos_tag not in GermanSTTLIgnoreTags:
                    # print("Pos tag to possible Change:", pos_tag)
                    # print("Word:", word)
                    if pos_tag in list_conjugate:
                        # Re-conjugate the finite verb: past tense, 1st person
                        # singular, subjunctive mood.
                        conj_tmp = conjugate(word, PAST, 1, SG, mood=SUBJUNCTIVE)
                        conj = mod.safe_unicode(conj_tmp).encode('utf-8')
                        # print("Word Past: ", conj, " - Length: ", len(ListFinal), "\n")
                        # Empty strings pad the replacement (kept from the
                        # original behavior; they become extra spaces on join).
                        ListFinal.append("")
                        ListFinal.append(conj)
                        ListFinal.append("")
                        continue
                # udata = word.decode("utf-8")
                # asciidata = udata.encode("ascii", "ignore")
                ListFinal.append(word)
# https://pypi.python.org/pypi/languagedet
# Write the rewritten text to the temp file for the next pipeline stage.
# readabilityVar = str(mod.textstat.flesch_reading_ease(text))
writeThis = " ".join(ListFinal)
# Bugfix: the original called writeThis.encode('utf-8') and discarded the
# result (str.encode returns a new object, it does not mutate in place).
# The codecs writer below performs the utf-8 encoding, so the dead
# statement was removed.
with codecs.open("/tmp/onetipp_tmp.txt", 'wb+', encoding='utf-8') as f:
    f.write(writeThis)
    # (redundant f.close() removed: the with-block closes the file)

# mod.mysql.commit()
# mod.mysql.close()
#
# mod.sphinx.commit()
# mod.sphinx.close()

# Bugfix: use sys.exit() instead of the site-injected exit() builtin, which
# is not guaranteed to exist when the script runs without the site module.
sys.exit(0)
"""
The Flesch Reading Ease formula
function name - flesch_reading_ease(text)
returns the Flesch Reading Ease Score. The following table is helpful for assessing the readability of a document.
90-100 : Very Easy
80-89 : Easy
70-79 : Fairly Easy
60-69 : Standard
50-59 : Fairly Difficult
30-49 : Difficult
0-29 : Very Confusing
"""