Source code for embedeval.parsers.word2vec_simple

"""
embedeval
~~~~~~~~~

NLP Embedding Evaluation Tool

:copyright: (c) 2019 by Timo Furrer <tuxtimo@gmail.com>
:license: MIT, see LICENSE for more details.
"""

from typing import List, Tuple
from pathlib import Path

import numpy as np

from embedeval.errors import EmbedevalError
from embedeval.embedding import WordEmbedding


class SimpleWordEmbedding(WordEmbedding):
    """Represents a word2vec specific Word Embedding

    This Word Embedding should only be used for small datasets
    as it's purely implemented in Python and therefore somewhat slow.
    """

    def __init__(self, path, word_vectors):
        # path of the file this Embedding was loaded from
        self._path = path
        # mapping of word -> word vector (insertion-ordered dict)
        self.word_vectors = word_vectors

    @property
    def path(self) -> Path:
        """Path of the file this Embedding was loaded from"""
        return self._path

    @property
    def shape(self) -> Tuple[int, int]:
        """Size of the Embedding as an ``(N words, M dimensions)`` tuple

        Returns ``(0, 0)`` for an empty Embedding.
        """
        if not self.word_vectors:
            return (0, 0)

        # NOTE: ``dict.values()`` returns a non-subscriptable view in
        # Python 3, so the previous ``values()[0]`` raised a TypeError.
        # Use an iterator to fetch the first vector and ``len()`` so both
        # plain lists and numpy arrays are supported.
        first_vector = next(iter(self.word_vectors.values()))
        return (len(self.word_vectors), len(first_vector))

    def get_words(self) -> List[str]:
        """Return all words known to this Embedding"""
        return list(self.word_vectors.keys())

    def get_word_vector(self, word: str) -> np.array:
        """Return the word vector for the given word

        Raises:
            KeyError: if the word is not part of this Embedding.
        """
        return self.word_vectors[word]
def load_embedding(path: Path) -> SimpleWordEmbedding: """Load the given Word2Vec Word Embedding The format for the Embedding expects the n x m matrix size in the first row of the text file. The current implementation fails, if that's not the case. """ with open(path, "r", encoding="utf-8") as word2vec_file: header_line = word2vec_file.readline() try: word_size, word_vector_size = [int(x) for x in header_line.split()] except ValueError as exc: if "not enough" in str(exc): raise EmbedevalError( "The given Embedding file doesn't contain the N x M " "Embedding size in the header line" ) elif "too many" in str(exc): raise EmbedevalError( "The given Embedding file has too many values in the header line" ) elif "invalid literal" in str(exc): raise EmbedevalError( "The header line must contain two integers " f"for the size but does: '{header_line}'" ) else: raise EmbedevalError( "Unable to extract N x M Embedding size form the header line" ) from exc word_vectors = {} for word_number, line in enumerate(word2vec_file): word, *raw_word_vector = line.split() word_vector = [np.float32(x) for x in raw_word_vector] if len(word_vector) != word_vector_size: raise EmbedevalError( f"Promised word vector size {word_vector_size} from header " f"wasn't matched on line {word_number + 2} with a size of {len(word_vector)}" ) word_vectors[word] = word_vector if len(word_vectors) < word_size: raise EmbedevalError( f"Promised word size {word_size} from header " f"wasn't matched with a size of {len(word_vectors)}" ) return SimpleWordEmbedding(path, word_vectors)