# Source code for mindnlp.modules.embeddings.fasttext_embedding

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Fasttext_embedding"""

import os
import re
import json
import logging
from itertools import islice
import numpy as np
from mindspore import ops
from mindspore import Tensor
from mindnlp.utils import cache_file, unzip
from mindnlp.abc.modules.embedding import TokenEmbedding
from mindnlp.configs import DEFAULT_ROOT
from mindnlp._legacy.nn import Dropout

# Name of the JSON file in which `Fasttext.save` stores the hyper-parameters
# (`dropout`, `requires_grad`) and from which `Fasttext.load` reads them back.
JSON_FILENAME = 'fasttext_hyper.json'
# Name of the text file in which `Fasttext.save` stores the embedding matrix:
# one header line followed by one whitespace-separated vector per line.
EMBED_FILENAME = 'fasttext.txt'
# NOTE(review): this sets the level of the *root* logger as an import-time side
# effect, which affects every library in the process - confirm this is intended.
logging.getLogger().setLevel(logging.INFO)


class Fasttext(TokenEmbedding):
    r"""
    Fasttext embedding layer that looks up rows of a 2-D embedding table.

    Args:
        init_embed (Tensor): Passing into Tensor, use these values to initialize Embedding directly.
            It is read as a 2-D table: ``shape[0]`` is the number of vectors and ``shape[1]`` is
            the dimension of each vector.
        requires_grad (bool): Whether this parameter needs to be gradient to update. Default: True.
        dropout (float): Dropout of the output of Embedding. Default: 0.0.

    Examples:
        >>> init_embed = Tensor(np.zeros((4, 4)).astype(np.float32))
        >>> fasttext_embed = Fasttext(init_embed)
        >>> ids = Tensor([1, 2, 3])
        >>> output = fasttext_embed(ids)

    """
    # Download locations of the supported pretrained vectors, keyed by name.
    urls = {
        "1M": "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip",
        "1M-subword": "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M-subword.vec.zip",
    }

    # Vector dimensions accepted by `from_pretrained`.
    dims = [300]

    def __init__(self, init_embed, requires_grad: bool = True, dropout=0.0):
        # Only `init_embed` is forwarded to the base class; `requires_grad` and
        # `dropout` are stored on this instance below.
        super().__init__(init_embed)

        self._embed_len = init_embed.shape[0]
        self.embed = init_embed
        self._embed_dim = init_embed.shape[1]
        self._embed_size = init_embed.shape
        self.requires_grad = requires_grad
        self.dropout_layer = Dropout(p=dropout)
        # Kept separately so that `save` can serialise the dropout probability.
        self.dropout_p = dropout

    @classmethod
    def from_pretrained(cls, name='1M', dims=300, root=DEFAULT_ROOT, special_first=True, **kwargs):
        r"""
        Creates Embedding instance from given pre-trained word vector.

        The archive is fetched through `cache_file` into ``<root>/embeddings/Fasttext`` and
        unzipped there only when the extracted ``.vec`` file does not exist yet. The first line
        of the ``.vec`` file is skipped as a header, and the first whitespace-separated field
        of every other line (the token) is discarded.

        Args:
            name (str): The name of the pretrained vector. Default: "1M".
            dims (int): The dimension of the pretrained vector. Default: 300.
            root (str): Default storage directory. Default: DEFAULT_ROOT.
            special_first (bool): Indicates whether special participles from special_tokens will be added to
                the top of the dictionary. If True, add special_tokens to the beginning of the dictionary,
                otherwise add them to the end. Default: True.
            kwargs (dict):
                - requires_grad (bool): Whether this parameter needs to be gradient to update. Default: True.
                - dropout (float): Dropout of the output of Embedding. Default: 0.0.

        Returns:
            - Fasttext, Returns an embedding instance generated through a pretrained word vector.

        Raises:
            ValueError: If `name` is not a key of `urls` or `dims` is not in `dims`.

        """
        if name not in cls.urls:
            raise ValueError(f"The argument 'name' must in {cls.urls.keys()}, but got {name}.")
        if dims not in cls.dims:
            raise ValueError(f"The argument 'dims' must in {cls.dims}, but got {dims}.")
        cache_dir = os.path.join(root, "embeddings", "Fasttext")

        url = cls.urls[name]
        # Keep only the part of the URL after the last '/'.
        download_file_name = re.sub(r".+/", "", url)
        fasttext_file_name = f"wiki-news-{dims}d-{name}.vec"
        path, _ = cache_file(filename=download_file_name, cache_dir=cache_dir, url=url)
        fasttext_file_path = os.path.join(cache_dir, fasttext_file_name)
        if not os.path.exists(fasttext_file_path):
            unzip(path, cache_dir)

        embeddings = []
        with open(fasttext_file_path, encoding='utf-8') as file:
            # islice(..., 1, None) skips the header line of the .vec file.
            for line in islice(file, 1, None):
                _, embedding = line.split(maxsplit=1)
                embeddings.append(np.fromstring(embedding, dtype=np.float32, sep=' '))

        # Two extra rows are added for special tokens: a random vector followed by
        # an all-zero vector (presumably unknown and padding - verify against the
        # vocabulary that is used together with this embedding).
        if special_first:
            embeddings.insert(0, np.random.rand(dims))
            embeddings.insert(1, np.zeros((dims,), np.float32))
        else:
            embeddings.append(np.random.rand(dims))
            embeddings.append(np.zeros((dims,), np.float32))

        embeddings = np.array(embeddings).astype(np.float32)

        requires_grad = kwargs.get('requires_grad', True)
        dropout = kwargs.get('dropout', 0.0)

        return cls(Tensor(embeddings), requires_grad, dropout)

    def construct(self, ids):
        r"""
        Look up the embedding vectors of the given ids.

        Args:
            ids (Tensor): Ids to query.

        Returns:
            - Tensor, returns the Embedding query results. Its shape is ``ids.shape + (embed_dim,)``.

        """
        out_shape = ids.shape + (self._embed_dim,)
        # Gather on a flat id vector, then restore the shape of `ids`.
        flat_ids = ids.reshape((-1,))
        output_for_reshape = ops.gather(self.embed, flat_ids, 0)
        output = ops.reshape(output_for_reshape, out_shape)
        # `dropout` is not defined in this class; presumably it is inherited from
        # TokenEmbedding - verify against the base class.
        return self.dropout(output)

    def save(self, foldername, root=DEFAULT_ROOT):
        r"""
        Save the embedding to the specified location.

        Two files are written to ``<root>/embeddings/Fasttext/save/<foldername>``: a JSON file
        with the `dropout` and `requires_grad` settings, and a text file holding a
        ``"<nums> <dims>"`` header line followed by one vector per line.

        Args:
            foldername (str): Name of the folder to store.
            root (Path): Path of the embedding folder. Default: DEFAULT_ROOT.

        Returns:
            None

        """
        folder = os.path.join(root, 'embeddings', 'Fasttext', 'save', foldername)
        os.makedirs(folder, exist_ok=True)

        embed = self.embed
        nums = self._embed_len
        dims = self._embed_dim

        # Constructor arguments that `load` passes back to `cls(...)`.
        hyper = {
            'dropout': self.dropout_p,
            'requires_grad': self.requires_grad,
        }

        with open(os.path.join(folder, JSON_FILENAME), 'w', encoding='utf-8') as file:
            json.dump(hyper, file, indent=2)

        with open(os.path.join(folder, EMBED_FILENAME), 'w', encoding='utf-8') as file:
            # Reserve a blank header line; it is overwritten in place after the vectors.
            file.write(f'{" " * 30}\n')
            for i in range(0, nums):
                embed_write = list(embed[i])
                vec_write = ' '.join(map(str, embed_write))
                file.write(f'{vec_write}\n')
            file.seek(0)
            file.write(f'{nums} {dims}')

        logging.info('Embedding has been saved to %s', folder)

    @classmethod
    def load(cls, foldername=None, root=DEFAULT_ROOT, load_npy=False, npy_path=None):
        r"""
        Load embedding from the specified location.

        Args:
            foldername (str): Name of the folder to load. Required when load_npy is False.
                Default: None.
            root (Path): Path of the embedding folder. Default: DEFAULT_ROOT.
            load_npy (Bool): Whether to initialize the embedding as a npy file. Npy_path are valid
                when load_npy is True. Default: False.
            npy_path (Path): Location of the npy file. Required when load_npy is True.
                Default: None.

        Returns:
            - Fasttext, the embedding instance built from the loaded values. When loading from a
              npy file the default `requires_grad` and `dropout` are used; otherwise they are read
              from the saved JSON file.

        Raises:
            TypeError: If the path argument required by the selected mode is None.
            FileNotFoundError: If one of the saved files is missing from the folder.

        """

        if load_npy:
            if npy_path is None:
                raise TypeError("The argument 'npy_path' must be given when 'load_npy' is True.")
            load_embed = np.load(npy_path)

            return cls(Tensor(load_embed))

        if foldername is None:
            raise TypeError("The argument 'foldername' must be given when 'load_npy' is False.")

        folder = os.path.join(root, 'embeddings', 'Fasttext', 'save', foldername)
        # Raise explicitly instead of asserting: asserts are removed under `python -O`.
        for name in (JSON_FILENAME, EMBED_FILENAME):
            if not os.path.exists(os.path.join(folder, name)):
                raise FileNotFoundError(f"{name} not found in {folder}.")

        with open(os.path.join(folder, JSON_FILENAME), 'r', encoding='utf-8') as file:
            hyper = json.load(file)

        embeddings = []
        with open(os.path.join(folder, EMBED_FILENAME), encoding='utf-8') as file:
            # Skip the "<nums> <dims>" header line written by `save`.
            file.readline()
            for line in file:
                embedding = line.rstrip('\n')
                embeddings.append(np.fromstring(embedding, dtype=np.float32, sep=' '))

        embeddings = np.array(embeddings).astype(np.float32)

        logging.info("Load embedding from %s", folder)

        return cls(Tensor(embeddings), **hyper)