#!/usr/bin/python
# -*- coding: ISO-8859-1 -*-
from __future__ import division
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

###########################
### Autor: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.1a  - 26-10-2015@23:53 Uhr
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python: Main File
###########################

"""
Synonym bzw Wortersetzung parallelisieren für schnellere Verarbeitung und Reaktionszeit des Tools

Antonym Datenbank Entwicklung mit Hilfe gecrawlter Websites

Aufbau einer Datenbank mit einfacher deutscher Sprache

Berechnung des Lesbarkeitswerts eines Eingabetextes - darauf basierend bei einfachen Texten die "Simple German"-Datenbank für den Austausch nutzen, wissenschaftliche Texte mit Leipzig und unserer lokalen Synonym-Datenbank austauschen

"""


#https://docs.python.org/2/library/configparser.html
import os
import sys
reload(sys)
sys.path.append('/home/onetipp/python/modules')
os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'

import random
import codecs
import re
import mod

from textblob_de import TextBlobDE as TextBlob
from textblob_de import PatternTagger
from textblob_de import TextBlobDE

# Cursor on the MySQL handle exposed by the project-local `mod` module.
cursorMysql = mod.mysql.cursor()

noDoubleHash    = set()         # synonyms contained here are skipped by the synonym loop
re_match        = r"(\?|\.|\!)" # sentence-ending punctuation: "?", "." or "!"

# Read the input and output file paths from the command line.
# Exit with a usage hint (status 1) instead of a bare IndexError when one is missing.
if len(sys.argv) < 3:
    sys.exit("Usage: <script> <inputfile> <outputfile>")
inputfile   = sys.argv[1]
outputfile  = sys.argv[2]

# Read the whole input file into a string; the context manager closes the
# file handle deterministically instead of leaving it to garbage collection.
with codecs.open(inputfile, "r", encoding='utf-8') as inputHandle:
    text = inputHandle.read()

# Summarize the text first and then work on the summary.
tSumy       = mod.summarizeText(text)
# The summary is split with sent_tokenize, the untouched input with word_tokenize.
tokens      = mod.nltk.sent_tokenize(tSumy)
tokensRaw   = mod.nltk.word_tokenize(text)

count                   = -1
changeEveryWord         = 6 # Ancillary copyright (Leistungsschutzrecht): 7 characters may be used, beyond that the text must be changed
changeEveryWordFlag     = 0
changeEveryWordTemp     = 0 # temporary upcount
ignoreNextWord          = 0

# Walk over every entry of `tokens`, tag it with TextBlobDE and rank synonym
# candidates for each tagged word longer than three characters.
# NOTE(review): the picked synonym only ends up in firstBestSynHit /
# firstBestSynHitRank; `tokens` itself is not modified by this loop.
for count, word in enumerate(tokens):  # `count` is the running index of the current entry

    word = mod.to_unicode(word)
    blob = TextBlobDE(word)

    for elem in blob.tags:

        myWord = elem[0]
        myCatg = elem[1]

        # Skip short words (three characters or fewer).
        if len(myWord) <= 3:
            continue

        sLeipzigList    = mod.getSynLeipzig(myWord)
        synDictLeipzig  = {}

        if not sLeipzigList:
            continue

        # Rank every candidate that is not excluded via noDoubleHash.
        # SynRanker receives the candidate and the full entry `word`.
        for wSynL in sLeipzigList:
            if wSynL in noDoubleHash:
                continue
            synDictLeipzig[wSynL] = mod.SynRanker(wSynL, word)
            print("Wort: ",myWord, " Synon ",wSynL, " -> SynRank: ", synDictLeipzig[wSynL])

        if not synDictLeipzig:
            continue

        # Sort the candidates by rank (highest first) and pick one at random.
        # This happens once per word, after all candidates have been ranked,
        # rather than after every single candidate.
        sortedSynList       = sorted(synDictLeipzig.items(), key=lambda x: x[1], reverse=True)
        randElement         = random.randint(0, len(sortedSynList)-1)
        firstBestSynHit     = mod.to_unicode(sortedSynList[randElement][0])
        firstBestSynHitRank = mod.to_unicode(sortedSynList[randElement][1])


# Write the output files.
outputtext          = ' '.join(tokens)
outputtextRaw       = ' '.join(tokensRaw)

# Result of mod.textstat.flesch_reading_ease for the input text, kept as a string.
readabilityVar      = str(mod.textstat.flesch_reading_ease(text))

# The `with` blocks close the files on exit, so no explicit close() is needed.
with codecs.open(outputfile, 'w', encoding='utf-8') as f:
    f.write(outputtext)

with codecs.open(outputfile+".raw.txt", 'w', encoding='utf-8') as f:
    f.write(outputtextRaw)

# Commit and close the mysql and sphinx handles exposed by `mod`.
mod.mysql.commit()
mod.mysql.close()

mod.sphinx.commit()
mod.sphinx.close()

# sys.exit() is used because the bare exit() helper is injected by the `site`
# module for interactive sessions and is not guaranteed to exist in programs.
sys.exit(0)


"""
The Flesch Reading Ease formula

function name - flesch_reading_ease(text)

returns the Flesch Reading Ease Score. The following table is helpful for assessing the ease of readability of a document.

90-100 : Very Easy
80-89 : Easy
70-79 : Fairly Easy
60-69 : Standard
50-59 : Fairly Difficult
30-49 : Difficult
0-29 : Very Confusing

"""