#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Defines classes related to mapping vocabulary to n-dimensional points."""

from io import open
import logging
from os import path
import tarfile

import numpy as np
from numpy import float32
from six import PY2
from six import text_type as unicode
from six import iteritems
from six.moves import map
from six.moves import range
from six import string_types
from six.moves import cPickle as pickle

from .base import CountedVocabulary, OrderedVocabulary
from ..utils import _open

logger = logging.getLogger(__name__)


class Embedding(object):
    """Maps a vocabulary to d-dimensional points."""

    def __init__(self, vocabulary, vectors):
        self.vocabulary = vocabulary
        self.vectors = np.asarray(vectors)

        if len(self.vocabulary) != self.vectors.shape[0]:
            raise ValueError("Vocabulary has {} items but we have {} "
                             "vectors".format(len(vocabulary),
                                              self.vectors.shape[0]))

    def __getitem__(self, k):
        return self.vectors[self.vocabulary[k]]

    def __contains__(self, k):
        return k in self.vocabulary

    def __delitem__(self, k):
        """Remove the word and its vector from the embedding.

        Note:
          This operation costs \\Theta(n). Be careful about putting it in a
          loop.
        """
        index = self.vocabulary[k]
        del self.vocabulary[k]
        self.vectors = np.delete(self.vectors, index, 0)

    def __len__(self):
        return len(self.vocabulary)

    def __iter__(self):
        for w in self.vocabulary:
            yield w, self[w]

    @property
    def words(self):
        return self.vocabulary.words

    @property
    def shape(self):
        return self.vectors.shape

    def apply_expansion(self, expansion):
        """Apply a vocabulary expansion to the current embeddings."""
        self.vocabulary = expansion(self.vocabulary)

    def get(self, k, default=None):
        try:
            return self[k]
        except KeyError:
            return default

    def most_frequent(self, k, inplace=False):
        """Keep only the `k` most frequent words in the embeddings."""
        vocabulary = self.vocabulary.most_frequent(k)
        vectors = np.asarray([self[w] for w in vocabulary])
        if inplace:
            self.vocabulary = vocabulary
            self.vectors = vectors
            return self
        return Embedding(vectors=vectors, vocabulary=vocabulary)

    def normalize_words(self, ord=2, inplace=False):
        """Normalize the embeddings matrix row-wise.

        Args:
          ord: normalization order. Possible values {1, 2, 'inf', '-inf'}.
        """
        if ord == 2:
            ord = None  # numpy uses this flag to indicate l2.
        vectors = self.vectors.T / np.linalg.norm(self.vectors, ord, axis=1)
        if inplace:
            self.vectors = vectors.T
            return self
        return Embedding(vectors=vectors.T, vocabulary=self.vocabulary)

    def nearest_neighbors(self, word, top_k=10):
        """Return the `top_k` nearest words to the given `word`.

        Args:
          word (string): single word.
          top_k (integer): decides how many neighbors to report.

        Returns:
          A list of words sorted by their distances to `word`; the closest
          comes first.

        Note:
          L2 metric is used to calculate distances.
        """
        # TODO(rmyeid): Use scikit ball tree, if scikit is available.
        point = self[word]
        diff = self.vectors - point
        distances = np.linalg.norm(diff, axis=1)
        # Skip index 0 of the sorted ids: that is the query word itself.
        top_ids = distances.argsort()[1:top_k + 1]
        return [self.vocabulary.id_word[i] for i in top_ids]

    def distances(self, word, words):
        """Calculate Euclidean pairwise distances between `word` and `words`.

        Args:
          word (string): single word.
          words (list): list of strings.

        Returns:
          numpy array of the distances.

        Note:
          L2 metric is used to calculate distances.
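
        Example:
          An illustrative sketch; the pickle path and the words below are
          hypothetical placeholders and must exist in the loaded
          vocabulary::

            >>> emb = Embedding.load("embeddings.pkl")
            >>> emb.distances(u"cat", [u"dog", u"fish"])
            array([...])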
""" point = self[word] vectors = np.asarray([self[w] for w in words]) diff = vectors - point distances = np.linalg.norm(diff, axis=1) return distances @staticmethod def from_gensim(model): word_count = {} vectors = [] for word, vocab in sorted(iteritems(model.vocab), key=lambda item: -item[1].count): vectors.append(model.syn0[vocab.index]) word_count[word] = vocab.count vocab = CountedVocabulary(word_count=word_count) vectors = np.asarray(vectors) return Embedding(vocabulary=vocab, vectors=vectors) @staticmethod def from_word2vec_vocab(fvocab): counts = {} with _open(fvocab) as fin: for line in fin: word, count = unicode(line).strip().split() counts[word] = int(count) return CountedVocabulary(word_count=counts) @staticmethod def _from_word2vec_binary(fname): with _open(fname, 'rb') as fin: words = [] header = unicode(fin.readline()) vocab_size, layer1_size = list(map(int, header.split())) # throws for invalid file format vectors = np.zeros((vocab_size, layer1_size), dtype=float32) binary_len = np.dtype(float32).itemsize * layer1_size for line_no in xrange(vocab_size): # mixed text and binary: read text first, then binary word = [] while True: ch = fin.read(1) if ch == b' ': break if ch != b'\n': # ignore newlines in front of words (some binary files have newline, some don't) word.append(ch) word = b''.join(word) index = line_no words.append(word) vectors[index, :] = np.fromstring(fin.read(binary_len), dtype=float32) return words, vectors @staticmethod def _from_word2vec_text(fname): with _open(fname, 'rb') as fin: words = [] header = unicode(fin.readline()) vocab_size, layer1_size = list(map(int, header.split())) # throws for invalid file format vectors = [] for line_no, line in enumerate(fin): try: parts = unicode(line, encoding="utf-8").strip().split() except TypeError as e: parts = line.strip().split() except Exception as e: logger.warning("We ignored line number {} because of erros in parsing" "\n{}".format(line_no, e)) continue # We differ from Gensim implementation. # Our assumption that a difference of one happens because of having a # space in the word. if len(parts) == layer1_size + 1: word, weights = parts[0], list(map(float32, parts[1:])) elif len(parts) == layer1_size + 2: word, weights = parts[:2], list(map(float32, parts[2:])) word = u" ".join(word) else: logger.warning("We ignored line number {} because of unrecognized " "number of columns {}".format(line_no, parts[:-layer1_size])) continue index = line_no words.append(word) vectors.append(weights) vectors = np.asarray(vectors, dtype=np.float32) return words, vectors @staticmethod def from_word2vec(fname, fvocab=None, binary=False): """ Load the input-hidden weight matrix from the original C word2vec-tool format. Note that the information stored in the file is incomplete (the binary tree is missing), so while you can query for word similarity etc., you cannot continue training with a model loaded this way. `binary` is a boolean indicating whether the data is in binary word2vec format. Word counts are read from `fvocab` filename, if set (this is the file generated by `-save-vocab` flag of the original C tool). 
""" vocabulary = None if fvocab is not None: logger.info("loading word counts from %s" % (fvocab)) vocabulary = Embedding.from_word2vec_vocab(fvocab) logger.info("loading projection weights from %s" % (fname)) if binary: words, vectors = Embedding._from_word2vec_binary(fname) else: words, vectors = Embedding._from_word2vec_text(fname) if not vocabulary: vocabulary = OrderedVocabulary(words=words) return Embedding(vocabulary=vocabulary, vectors=vectors) @staticmethod def load(fname): """Load an embedding dump generated by `save`""" content = _open(fname).read() if PY2: state = pickle.loads(content) else: state = pickle.loads(content, encoding='latin1') voc, vec = state if len(voc) == 2: words, counts = voc word_count = dict(zip(words, counts)) vocab = CountedVocabulary(word_count=word_count) else: vocab = OrderedVocabulary(voc) return Embedding(vocabulary=vocab, vectors=vec) def save(self, fname): """Save a pickled version of the embedding into `fname`.""" vec = self.vectors voc = self.vocabulary.getstate() state = (voc, vec) with open(fname, 'wb') as f: pickle.dump(state, f, protocol=pickle.HIGHEST_PROTOCOL)