Source code for embedeval.parsers.word2vec_gensim
"""
embedeval
~~~~~~~~~
NLP Embedding Evaluation Tool
:copyright: (c) 2019 by Timo Furrer <tuxtimo@gmail.com>
:license: MIT, see LICENSE for more details.
"""
from typing import List, Tuple
from pathlib import Path
import numpy as np
from gensim.models import KeyedVectors
from embedeval.embedding import WordEmbedding
from embedeval.errors import EmbedevalError
[docs]class KeyedVectorsWordEmbedding(WordEmbedding):
"""Represents a word2vec KeyedVectors specific Word Embedding
The word2vec file will be parsed by ``gensim``.
The gensim ``KeyedVectors`` instance is made available
in the ``self.keyed_vectors`` attribute.
"""
def __init__(self, path, keyed_vectors):
self._path = path
#: Holds the gensim KeyedVectors instance
self.keyed_vectors = keyed_vectors
@property
def path(self) -> Path:
return self._path # pragma: no cover
@property
def shape(self) -> Tuple[int, int]:
return (len(self.keyed_vectors.vectors), self.keyed_vectors.vector_size)
[docs] def get_words(self) -> List[str]:
return list(self.keyed_vectors.vocab.keys())
[docs] def get_word_vector(self, word: str) -> np.array:
return self.keyed_vectors.word_vec(word)
def load_embedding(path: Path, binary=False) -> KeyedVectorsWordEmbedding:
"""Load the given Word2Vec Word Embedding using gensim
The ``gensim.load_word2vec_format`` function is used to parse
the word2vec Embdding file.
The ``gensim.models.keyedvectors.KeyedVectors`` is wrapped in the
embedeval specific ``WordEmbedding`` object.
"""
try:
keyed_vectors = KeyedVectors.load_word2vec_format(
path, binary=binary, unicode_errors="ignore"
)
except Exception as exc:
raise EmbedevalError(
f"Failed to parse Embedding with gensim KeyedVectors: {exc}"
)
return KeyedVectorsWordEmbedding(path, keyed_vectors)