#!/usr/bin/python
# -*- coding: ISO-8859-1 -*-
from __future__ import division
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

###########################
### Autor: Sebastian Enger / M.Sc.
### Copyright: Sebastian Enger
### Licence: Commercial / OneTipp
### Version: 1.0.8  - 21-10-2015@23:53 Uhr
### Contact: sebastian.enger@gmail.com
### OneTipp Text Tool in Python
###########################

#https://docs.python.org/2/library/configparser.html

import os
import nltk                                            # apt-get install python-mysqldb
from sphinxit.core.processor import Search                  # http://sphinxit.readthedocs.org/en/latest/
import codecs
import re
from transliterate import translit, get_available_language_codes
import onetipp

import sys
# NOTE(review): this path is appended after `import onetipp` has already run
# above; if onetipp lives in this directory the import order needs revisiting.
sys.path.append('/home/onetipp/python/modules')
os.environ['PYTHON_EGG_CACHE'] = '/home/compress/'
# Python 2's site.py deletes sys.setdefaultencoding during interpreter startup,
# so calling it directly raises AttributeError. reload(sys) restores the
# attribute; skip the reload when it is already available.
if not hasattr(sys, 'setdefaultencoding'):
    reload(sys)
sys.setdefaultencoding('utf-8')

# Synonyms that were already inserted as replacements; the replacement loop
# consults this set so the same synonym is not used twice.
noDoubleHash    = set()
# Matches a token that ends in sentence-final punctuation (".", "?" or "!").
re_match        = r"(\?|\.|\!)$"

# Input and output file paths are taken from the command line.
if len(sys.argv) < 3:
    sys.exit("usage: %s INPUTFILE OUTPUTFILE" % sys.argv[0])
inputfile   = sys.argv[1]
outputfile  = sys.argv[2]

# Read the whole input file as UTF-8 text; the context manager guarantees the
# file handle is closed again (the original left it open).
with codecs.open(inputfile, 'r', encoding='utf-8') as infile:
    text = infile.read()

# Summarize the text first, then work on the summary. The full text is
# tokenized as well and kept in tokensRaw.
tSumy       = onetipp.summarizeText(text)
tokens      = nltk.word_tokenize(tSumy)
tokensRaw   = nltk.word_tokenize(text)

# Index of the token currently being processed (-1 = none processed yet).
count                   = -1
# Ancillary copyright ("Leistungsschutzrecht"): per the original author's note
# 7 units may be reused unchanged, beyond that the text has to be altered.
# NOTE(review): the original note speaks of characters, the loop counts tokens.
changeEveryWord         = 8
# 1 while the loop is in the "leave the following tokens untouched" phase.
changeEveryWordFlag     = 0
# Running counter used together with changeEveryWord to end that phase.
changeEveryWordTemp     = 0

# Trailing punctuation that is carried over from the original token to the
# synonym that replaces it.
_SUFFIX_PUNCTUATION = ('.', '?', '!', ',', ';', ':')


def _store_token(index, replacement):
    """Write ``replacement`` into ``tokens`` and mirror it into ``tokensRaw``."""
    tokens[index] = replacement
    # NOTE(review): tokensRaw is tokenized from the full text while ``index``
    # runs over the summary tokens, so the positions need not correspond.
    # The mirror write is kept from the original, but guarded so a summary
    # index past the end of tokensRaw cannot raise IndexError.
    if index < len(tokensRaw):
        tokensRaw[index] = replacement


def _best_unused_synonym(candidates, word):
    """Return the highest-ranked candidate not used before, or None.

    Candidates already present in ``noDoubleHash`` are skipped; the rest are
    scored with ``onetipp.SynRanker(candidate, word)`` and the first candidate
    with the highest score wins.
    """
    ranked = {}
    for candidate in candidates:
        if candidate not in noDoubleHash:
            ranked[candidate] = onetipp.SynRanker(candidate, word)
    if not ranked:
        # Every candidate was used already (or there were none).
        return None
    return max(ranked, key=ranked.get)


def _local_synonym_candidates(word):
    """Return synonym candidates for ``word`` from the local synonym database.

    Queries the ``onetipp_syn_simple`` Sphinx index for a matching row id and
    loads that row's ";"-separated synonym list from ``synonym_unique_simple``.
    Returns an empty list when nothing is found.
    """
    search_query_syn = Search(indexes=['onetipp_syn_simple'], config=onetipp.SphinxitConfig)
    search_query_syn = search_query_syn.match(word).options(
        ranker='proximity_bm25',
        max_matches=1,
        max_query_time=350,
        field_weights={'synonyms': 100},
    )
    sphinx_result_syn = search_query_syn.ask()

    try:
        # First value of the first hit; the code below uses it as the row uid.
        synID = list(sphinx_result_syn['result']['items'][0].values())[0]
    except IndexError:
        # No hit in the index.
        return []
    if not synID > 0:
        return []

    # Parameterized query instead of string interpolation.
    onetipp.cursorMysql.execute(
        "SELECT synonyms FROM (synonym_unique_simple) WHERE uid= %s", (synID,))
    syn_content = onetipp.cursorMysql.fetchone()
    if not syn_content:
        # fetchone() returned no row; the original crashed on list(None) here.
        return []
    return syn_content[0].decode(encoding="utf-8", errors="ignore").split(";")


def _decorate_synonym(synonym, word, starts_sentence):
    """Adapt ``synonym`` to the position of the ``word`` it replaces.

    Upper-cases the first character when the token starts a sentence and
    re-attaches the trailing punctuation character of ``word``, if any.
    """
    replacement = str(synonym)
    if starts_sentence:
        # The original called .title() and discarded the result, so nothing
        # was ever capitalized. Only the first character is upper-cased so
        # the remaining casing of the synonym stays intact.
        replacement = replacement[:1].upper() + replacement[1:]
    if word.endswith(_SUFFIX_PUNCTUATION):
        replacement += word[-1]
    return replacement


for count, word in enumerate(tokens):
    # Look the token up in the names table. The value is passed as a query
    # parameter so quotes inside a token cannot break or alter the statement.
    wordTemp = word.encode('ascii', 'ignore')
    onetipp.cursorMysql.execute(
        "SELECT * FROM (namen_table) WHERE BINARY `name` = %s LIMIT 1;", (wordTemp,))
    name_content = onetipp.cursorMysql.fetchone()

    # A name was found -> do not swap in a synonym.
    if name_content is not None:
        _store_token(count, onetipp.deumlaut(word))
        continue

    # End the "leave tokens untouched" phase once enough tokens have passed.
    if changeEveryWordTemp == (changeEveryWord - 1):
        changeEveryWordFlag     = 0
        changeEveryWordTemp     = 0

    if changeEveryWordFlag == 1:
        changeEveryWordTemp += 1

    # Only tokens of at least four characters are replaced, and only while
    # no replacement phase is pending.
    if len(word) < 4 or changeEveryWordFlag != 0:
        continue

    # Ask the Leipzig source first; fall back to the local synonym database
    # only when it returns nothing.
    candidates = onetipp.getSynLeipzig(word)
    if not candidates:
        candidates = _local_synonym_candidates(word)

    bestSynonym = _best_unused_synonym(candidates, word)
    if bestSynonym is None:
        continue

    # The first token always starts a sentence; otherwise check whether the
    # previous token ends in sentence-final punctuation. (tokens[count - 1]
    # with count == 0 would wrap around to the last token.)
    startsSentence = count == 0 or re.search(re_match, tokens[count - 1]) is not None
    firstBestSynHit = _decorate_synonym(bestSynonym, word, startsSentence)

    _store_token(count, onetipp.deumlaut(firstBestSynHit))

    # Remember the undecorated synonym: the membership test in
    # _best_unused_synonym compares raw candidates, not decorated ones.
    noDoubleHash.add(bestSynonym)
    changeEveryWordFlag = 1
    changeEveryWordTemp += 1

# Write the output file.
outputtext          = ' '.join(tokens)
# Joined for parity with the original script; not written to the output file.
outputtextRaw       = ' '.join(tokensRaw)

# NOTE(review): the original computed
# textstat.flesch_reading_ease(outputtextRaw) at this point, but `textstat` is
# never imported in this file, so the script stopped with a NameError before
# the output was written. The score was only used by commented-out code, so
# the call has been dropped; re-add it together with the import if needed.

# Encode explicitly as UTF-8 instead of relying on the interpreter's default
# encoding; the context manager closes the file, no explicit close() needed.
with codecs.open(outputfile, 'w', encoding='utf-8') as f:
    f.write(outputtext)

onetipp.mysql.commit()
onetipp.mysql.close()
# sys.exit is always available; the bare exit() helper is injected by site.py.
sys.exit(0)


"""
The Flesch Reading Ease formula

function name - flesch_reading_ease(text)

Returns the Flesch Reading Ease score. The following table is helpful for assessing the ease of readability of a document.

90-100 : Very Easy
80-89 : Easy
70-79 : Fairly Easy
60-69 : Standard
50-59 : Fairly Difficult
30-49 : Difficult
0-29 : Very Confusing

"""