# Implementation of RAKE - Rapid Automtic Keyword Exraction algorithm # as described in: # Rose, S., D. Engel, N. Cramer, and W. Cowley (2010). # Automatic keyword extraction from indi-vidual documents. # In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory.unknown: John Wiley and Sons, Ltd. # # NOTE: The original code (from https://github.com/aneesha/RAKE) # has been extended by a_medelyan (zelandiya) # with a set of heuristics to decide whether a phrase is an acceptable candidate # as well as the ability to set frequency and phrase length parameters # important when dealing with longer documents from __future__ import absolute_import from __future__ import print_function import re import operator import six import nltk from six.moves import range debug = False test = False def is_number(s): try: float(s) if '.' in s else int(s) return True except ValueError: return False def load_stop_words(stop_word_file): """ Utility function to load stop words from a file and return as a list of words @param stop_word_file Path and file name of a file containing stop words. @return list A list of stop words. """ stop_words = [] for line in open(stop_word_file): if line.strip()[0:1] != "#": for word in line.split(): # in case more than one per line stop_words.append(word) return stop_words def separate_words(text, min_word_return_size): """ Utility function to return a list of all words that are have a length greater than a specified number of characters. @param text The text that must be split in to words. @param min_word_return_size The minimum no of characters a word must have to be included. """ splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]') words = [] for single_word in splitter.split(text): current_word = single_word.strip().lower() #leave numbers in phrase, but don't count as words, since they tend to invalidate scores of their phrases if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word): words.append(current_word) return words def split_sentences(text): """ Utility function to return a list of sentences. @param text The text that must be split in to sentences. """ sentences = nltk.sent_tokenize(text) #sentence_delimiters = re.compile(u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]') #sentences = sentence_delimiters.split(text) return sentences def build_stop_word_regex(stop_word_file_path): stop_word_list = load_stop_words(stop_word_file_path) stop_word_regex_list = [] for word in stop_word_list: word_regex = '\\b' + word + '\\b' stop_word_regex_list.append(word_regex) stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE) return stop_word_pattern def generate_candidate_keywords(sentence_list, stopword_pattern, min_char_length=1, max_words_length=5): phrase_list = [] for s in sentence_list: tmp = re.sub(stopword_pattern, '|', s.strip()) phrases = tmp.split("|") for phrase in phrases: phrase = phrase.strip().lower() if phrase != "" and is_acceptable(phrase, min_char_length, max_words_length): phrase_list.append(phrase) return phrase_list def is_acceptable(phrase, min_char_length, max_words_length): # a phrase must have a min length in characters if len(phrase) < min_char_length: return 0 # a phrase must have a max number of words words = phrase.split() if len(words) > max_words_length: return 0 digits = 0 alpha = 0 for i in range(0, len(phrase)): if phrase[i].isdigit(): digits += 1 elif phrase[i].isalpha(): alpha += 1 # a phrase must have at least one alpha character if alpha == 0: return 0 # a phrase must have more alpha than digits characters if digits > alpha: return 0 return 1 def calculate_word_scores(phraseList): word_frequency = {} word_degree = {} for phrase in phraseList: word_list = separate_words(phrase, 0) word_list_length = len(word_list) word_list_degree = word_list_length - 1 #if word_list_degree > 3: word_list_degree = 3 #exp. for word in word_list: word_frequency.setdefault(word, 0) word_frequency[word] += 1 word_degree.setdefault(word, 0) word_degree[word] += word_list_degree #orig. #word_degree[word] += 1/(word_list_length*1.0) #exp. for item in word_frequency: word_degree[item] = word_degree[item] + word_frequency[item] # Calculate Word scores = deg(w)/frew(w) word_score = {} for item in word_frequency: word_score.setdefault(item, 0) word_score[item] = word_degree[item] / (word_frequency[item] * 1.0) #orig. #word_score[item] = word_frequency[item]/(word_degree[item] * 1.0) #exp. return word_score def generate_candidate_keyword_scores(phrase_list, word_score, min_keyword_frequency=1): keyword_candidates = {} for phrase in phrase_list: if min_keyword_frequency > 1: if phrase_list.count(phrase) < min_keyword_frequency: continue keyword_candidates.setdefault(phrase, 0) word_list = separate_words(phrase, 0) candidate_score = 0 for word in word_list: candidate_score += word_score[word] keyword_candidates[phrase] = candidate_score return keyword_candidates class Rake(object): def __init__(self, stop_words_path, min_char_length=1, max_words_length=5, min_keyword_frequency=1): self.__stop_words_path = stop_words_path self.__stop_words_pattern = build_stop_word_regex(stop_words_path) self.__min_char_length = min_char_length self.__max_words_length = max_words_length self.__min_keyword_frequency = min_keyword_frequency def run(self, text): sentence_list = split_sentences(text) phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern, self.__min_char_length, self.__max_words_length) word_scores = calculate_word_scores(phrase_list) keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores, self.__min_keyword_frequency) sorted_keywords = sorted(six.iteritems(keyword_candidates), key=operator.itemgetter(1), reverse=True) return sorted_keywords if test: text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types." # Split text into sentences sentenceList = split_sentences(text) #stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1 stoppath = "RAKE/SmartStoplist.txt" #SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1 stopwordpattern = build_stop_word_regex(stoppath) # generate candidate keywords phraseList = generate_candidate_keywords(sentenceList, stopwordpattern) # calculate individual word scores wordscores = calculate_word_scores(phraseList) # generate candidate keyword scores keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores) if debug: print(keywordcandidates) sortedKeywords = sorted(six.iteritems(keywordcandidates), key=operator.itemgetter(1), reverse=True) if debug: print(sortedKeywords) totalKeywords = len(sortedKeywords) if debug: print(totalKeywords) print(sortedKeywords[0:(totalKeywords // 3)]) rake = Rake("SmartStoplist.txt") keywords = rake.run(text) print(keywords)