"""
A naive keyword extractor which just pulls out nouns and noun phrases.

This previously used the PerceptronTagger, which is _way_ faster than NLTK's default tagger and more accurate to boot.
See <http://stevenloria.com/tutorial-state-of-the-art-part-of-speech-tagging-in-textblob/>.

However, it complicates the library's installation, and the spacy tagger is quite fast and good too.
"""

import spacy

# Pairwise merge rules used by extract_noun_phrases: each key is the pair of
# tags of two adjacent entries, and the value is the tag assigned to the
# merged (space-joined) entry that replaces them.
# Only entries tagged 'NNP' or 'NNI' are returned as noun phrases, so a run
# of adjectives ('JJ', 'JJ') is merged but only surfaces once it is joined
# to a following 'NN'.
CFG = {
    ('NNP', 'NNP'): 'NNP',
    ('NN', 'NN'): 'NNI',
    ('NNI', 'NN'): 'NNI',
    ('JJ', 'JJ'): 'JJ',
    ('JJ', 'NN'): 'NNI',
}


class POSTokenizer(Tokenizer):
    """
    Tokenizer that reduces each document to its nouns and noun phrases.
    """

    @staticmethod
    def prune(tdocs):
        """
        Prune terms which are totally subsumed by a phrase.

        This could be better if it just removes the individual keywords
        that occur in a phrase for each time that phrase occurs.

        Args:
            tdocs: a list of tokenized documents, each a list of term
                strings. It is iterated several times, so it must not be
                a one-shot iterator.

        Returns:
            A new list with one list per input document, in the same
            order, with the redundant terms filtered out.
        """
        all_terms = {t for toks in tdocs for t in toks}
        terms = set()
        phrases = set()
        for t in all_terms:
            if gram_size(t) > 1:
                phrases.add(t)
            else:
                terms.add(t)

        # Identify candidates for redundant terms (1-gram terms found in a
        # phrase). Compare against whole words of the phrases rather than
        # doing a substring test, so that e.g. "art" is not treated as part
        # of "party time".
        phrase_words = {word for ph in phrases for word in ph.split()}
        redundant = terms & phrase_words

        # Search all documents to check that these terms occur
        # only in a phrase. If not, remove it as a candidate.
        # This could be more efficient.
        cleared = {
            t for t in redundant
            if any(check_term(d, term=t) for d in tdocs)
        }
        redundant -= cleared

        return [[t for t in doc if t not in redundant] for doc in tdocs]

    def tokenize(self, docs):
        """
        Extract noun and noun-phrase keywords from each document.

        Args:
            docs: an iterable of raw document strings.

        Returns:
            A list with one keyword list per document: the lowercased,
            stripped noun tokens followed by the noun phrases found by
            extract_noun_phrases, after pruning with `prune`.
        """
        tags = ('NN', 'NNS', 'NNP', 'NNPS')

        keywords = []
        for doc in docs:
            # NOTE(review): `spacy` must be a callable pipeline object here.
            # With only a plain `import spacy` in scope this calls the
            # module itself and raises TypeError -- confirm where the
            # loaded pipeline is supposed to come from.
            toks = spacy(doc, tag=True, parse=False, entity=False)
            tagged = [(t.lower_.strip(), t.tag_) for t in toks]
            kws = [t for t, tag in tagged if tag in tags]
            kws += extract_noun_phrases(tagged)
            keywords.append(kws)
        # `prune` lives on the class, not at module level, so it has to be
        # reached through the instance.
        return self.prune(keywords)


def extract_noun_phrases(tagged_doc):
    """
    (From textblob)

    Repeatedly merge adjacent (text, tag) entries according to CFG and
    return the text of every entry that ends up tagged 'NNP' or 'NNI'.
    """
    entries = _normalize_tags(tagged_doc)

    while True:
        # Find the leftmost adjacent pair that CFG knows how to merge.
        for idx, (left, right) in enumerate(zip(entries, entries[1:])):
            merged_tag = CFG.get((left[1], right[1]), '')
            if merged_tag:
                # Replace the pair with a single merged entry, then rescan
                # from the start since new merges may now be possible.
                merged_text = '%s %s' % (left[0], right[0])
                entries[idx:idx + 2] = [(merged_text, merged_tag)]
                break
        else:
            # A full pass found nothing to merge: done.
            break

    return [entry[0] for entry in entries if entry[1] in ('NNP', 'NNI')]


def _normalize_tags(chunk):
    """
    (From textblob)

    Normalize the corpus tags.
    ("NN", "NN-PL", "NNS") -> "NN"
    """
    ret = []
    for word, tag in chunk:
        if tag == 'NP-TL' or tag == 'NP':
            ret.append((word, 'NNP'))
            continue
        if tag.endswith('-TL'):
            ret.append((word, tag[:-3]))
            continue
        if tag.endswith('S'):
            ret.append((word, tag[:-1]))
            continue
        ret.append((word, tag))
    return ret