#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
A naive keyword extractor which just pulls out nouns and noun phrases.

This module previously used the PerceptronTagger, which is _way_ faster
than NLTK's default tagger, and more accurate to boot. However, it
complicates the library's installation, and the spaCy tagger is quite
fast and good too.

Adapted from:
https://github.com/frnsys/broca/blob/master/broca/common/util.py
"""
#from textblob_de import TextBlobDE  # alternative German tagger, unused in the spaCy path


# Grammar for merging adjacent tags into noun phrases,
# e.g. ('JJ', 'NN') merges an adjective and a noun into an 'NNI'.
CFG = {
    ('NNP', 'NNP'): 'NNP',
    ('NN', 'NN'): 'NNI',
    ('NNI', 'NN'): 'NNI',
    ('JJ', 'JJ'): 'JJ',
    ('JJ', 'NN'): 'NNI',
}


def check_term(tdoc, term):
    """
    Check whether `term` occurs in `tdoc` outside of a phrase.
    If it does, it is no longer a candidate for pruning.
    """
    if term not in tdoc:
        return False
    # Occurrences of the term as a standalone token
    n = tdoc.count(term)
    # Phrases in the document that contain the term
    d = sum(1 for ph in tdoc if term != ph and term in ph)
    return n > d


def check_phrase(phrase, term):
    return term in phrase


def gram_size(term):
    """
    Convenience func for getting n-gram length.
    """
    return len(term.split(' '))


def prune(tdocs):
    """
    Prune terms which are totally subsumed by a phrase.

    This could be better if it just removed the individual keywords
    that occur in a phrase for each time that phrase occurs.
    """
    all_terms = set(t for toks in tdocs for t in toks)
    terms = set()
    phrases = set()
    for t in all_terms:
        if gram_size(t) > 1:
            phrases.add(t)
        else:
            terms.add(t)

    # Identify candidates for redundant terms
    # (1-gram terms found in a phrase)
    redundant = set()
    for t in terms:
        if any(t in ph for ph in phrases):
            redundant.add(t)

    # Search all documents to check that these terms occur
    # only in a phrase. If not, remove them as candidates.
    # This could be more efficient.
    cleared = set()
    for t in redundant:
        if any(check_term(d, term=t) for d in tdocs):
            cleared.add(t)
    redundant = redundant.difference(cleared)

    return [[t for t in doc if t not in redundant] for doc in tdocs]


def tokenize(doc, nlp):
    """
    Extract keyword candidates (nouns and noun phrases) from a single
    document string. `nlp` is an already-loaded spaCy pipeline.
    """
    # Penn Treebank noun tags. Note: German spaCy models emit STTS tags
    # via `tag_` ('NN', 'NE', ...), so this list may need adjusting
    # depending on the model in use.
    tags = ['NN', 'NNS', 'NNP', 'NNPS']
    keywords = []

    toks = nlp(doc)
    tagged = [(t.lower_.strip(), t.tag_) for t in toks]

    kws = [t for t, tag in tagged if tag in tags]
    kws += extract_noun_phrases(tagged)
    keywords.append(kws)

    return prune(keywords)


def extract_noun_phrases(tagged_doc):
    """
    Merge adjacent (word, tag) pairs into noun phrases according to CFG.
    (From textblob.)
    """
    tags = _normalize_tags(tagged_doc)
    merge = True
    while merge:
        merge = False
        for x in range(0, len(tags) - 1):
            t1 = tags[x]
            t2 = tags[x + 1]
            key = t1[1], t2[1]
            value = CFG.get(key, '')
            if value:
                merge = True
                tags.pop(x)
                tags.pop(x)
                match = '%s %s' % (t1[0], t2[0])
                tags.insert(x, (match, value))
                break
    return [t[0] for t in tags if t[1] in ['NNP', 'NNI']]


def _normalize_tags(chunk):
    ret = []
    for word, tag in chunk:
        if tag == 'NP-TL' or tag == 'NP':
            ret.append((word, 'NNP'))
            continue
        if tag.endswith('-TL'):
            ret.append((word, tag[:-3]))
            continue
        if tag.endswith('S'):
            ret.append((word, tag[:-1]))
            continue
        ret.append((word, tag))
    return ret
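

# Example usage: a minimal sketch. It assumes a German spaCy model is
# installed; the model name 'de_core_news_sm' is an assumption (older
# spaCy versions load German via spacy.load('de')).
if __name__ == '__main__':
    import spacy

    # Assumed model; install with: python -m spacy download de_core_news_sm
    nlp = spacy.load('de_core_news_sm')

    text = 'Die kleine Katze sitzt auf der warmen Fensterbank.'
    for kws in tokenize(text, nlp):
        print(kws)

    # Direct demo of the CFG-based phrase merging on pre-tagged tokens:
    tagged = [('starker', 'JJ'), ('kaffee', 'NN'), ('ist', 'VB'), ('gut', 'JJ')]
    print(extract_noun_phrases(tagged))  # -> ['starker kaffee']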