#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
A naive keyword extractor which just pulls out nouns and noun phrases.

This module previously used the PerceptronTagger, which is _way_ faster
than NLTK's default tagger, and more accurate to boot. However, it
complicates the library's installation, and the spaCy tagger is quite
fast and good too.

Adapted from:
https://github.com/frnsys/broca/blob/master/broca/common/util.py
"""
#from textblob_de import TextBlobDE  # alternative German tagger, unused in the spaCy path


# Grammar for merging adjacent tags into noun phrases,
# e.g. ('JJ', 'NN') merges an adjective and a noun into an 'NNI'.
CFG = {
    ('NNP', 'NNP'): 'NNP',
    ('NN', 'NN'): 'NNI',
    ('NNI', 'NN'): 'NNI',
    ('JJ', 'JJ'): 'JJ',
    ('JJ', 'NN'): 'NNI',
}


def check_term(tdoc, term):
    """
    Check whether `term` occurs in `tdoc` outside of a phrase.
    If it does, it is no longer a candidate for pruning.
    """
    if term not in tdoc:
        return False
    # Occurrences of the term as a standalone token
    n = tdoc.count(term)
    # Phrases in the document that contain the term
    d = sum(1 for ph in tdoc if term != ph and term in ph)
    return n > d


def check_phrase(phrase, term):
    return term in phrase


def gram_size(term):
    """
    Convenience func for getting n-gram length.
    """
    return len(term.split(' '))


def prune(tdocs):
    """
    Prune terms which are totally subsumed by a phrase.

    This could be better if it just removed the individual keywords
    that occur in a phrase for each time that phrase occurs.
    """
    all_terms = set(t for toks in tdocs for t in toks)
    terms = set()
    phrases = set()
    for t in all_terms:
        if gram_size(t) > 1:
            phrases.add(t)
        else:
            terms.add(t)

    # Identify candidates for redundant terms
    # (1-gram terms found in a phrase)
    redundant = set()
    for t in terms:
        if any(t in ph for ph in phrases):
            redundant.add(t)

    # Search all documents to check that these terms occur
    # only in a phrase. If not, remove them as candidates.
    # This could be more efficient.
    cleared = set()
    for t in redundant:
        if any(check_term(d, term=t) for d in tdocs):
            cleared.add(t)
    redundant = redundant.difference(cleared)

    return [[t for t in doc if t not in redundant] for doc in tdocs]


def tokenize(doc, nlp):
    """
    Extract keyword candidates (nouns and noun phrases) from a single
    document string. `nlp` is an already-loaded spaCy pipeline.
    """
    # Penn Treebank noun tags. Note: German spaCy models emit STTS tags
    # via `tag_` ('NN', 'NE', ...), so this list may need adjusting
    # depending on the model in use.
    tags = ['NN', 'NNS', 'NNP', 'NNPS']
    keywords = []

    toks = nlp(doc)
    tagged = [(t.lower_.strip(), t.tag_) for t in toks]

    kws = [t for t, tag in tagged if tag in tags]
    kws += extract_noun_phrases(tagged)
    keywords.append(kws)

    return prune(keywords)


def extract_noun_phrases(tagged_doc):
    """
    Merge adjacent (word, tag) pairs into noun phrases according to CFG.
    (From textblob.)
    """
    tags = _normalize_tags(tagged_doc)
    merge = True
    while merge:
        merge = False
        for x in range(0, len(tags) - 1):
            t1 = tags[x]
            t2 = tags[x + 1]
            key = t1[1], t2[1]
            value = CFG.get(key, '')
            if value:
                merge = True
                tags.pop(x)
                tags.pop(x)
                match = '%s %s' % (t1[0], t2[0])
                tags.insert(x, (match, value))
                break
    return [t[0] for t in tags if t[1] in ['NNP', 'NNI']]


def _normalize_tags(chunk):
    ret = []
    for word, tag in chunk:
        if tag == 'NP-TL' or tag == 'NP':
            ret.append((word, 'NNP'))
            continue
        if tag.endswith('-TL'):
            ret.append((word, tag[:-3]))
            continue
        if tag.endswith('S'):
            ret.append((word, tag[:-1]))
            continue
        ret.append((word, tag))
    return ret
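

# Example usage: a minimal sketch. It assumes a German spaCy model is
# installed; the model name 'de_core_news_sm' is an assumption (older
# spaCy versions load German via spacy.load('de')).
if __name__ == '__main__':
    import spacy

    # Assumed model; install with: python -m spacy download de_core_news_sm
    nlp = spacy.load('de_core_news_sm')

    text = 'Die kleine Katze sitzt auf der warmen Fensterbank.'
    for kws in tokenize(text, nlp):
        print(kws)

    # Direct demo of the CFG-based phrase merging on pre-tagged tokens:
    tagged = [('starker', 'JJ'), ('kaffee', 'NN'), ('ist', 'VB'), ('gut', 'JJ')]
    print(extract_noun_phrases(tagged))  # -> ['starker kaffee']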