# Source code for mindnlp.workflow.works.sentiment_analysis

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# pylint:disable=invalid-name
"""
Sentiment Analysis Work
"""

# pylint:disable=invalid-name,line-too-long

import os

import mindspore
from mindspore import Tensor
from mindspore.dataset import text
from mindspore.ops import functional as F
from mindnlp._legacy.functional import argmax
from mindnlp.workflow.work import Work
from mindnlp.workflow.downstream import BertForSentimentAnalysis
from mindnlp.models import BertConfig
from mindnlp.transforms import PadTransform
from mindnlp.transforms.tokenizers import BertTokenizer

# Usage example for the "sentiment_analysis" workflow; SentimentAnalysisWork
# stores this string on the instance as `_usage`. The sentence passed to
# `senta(...)` must match the 'text' echoed in the sample output below.
usage = r"""
    from mindnlp import Workflow
    
    senta = Workflow("sentiment_analysis")
    senta("这个产品用起来真的很流畅，我非常喜欢")
    ...
    [{'text': '这个产品用起来真的很流畅，我非常喜欢', 'label': 'positive', 'score': 0.9953349232673645}]
    ...
"""


class SentimentAnalysisWork(Work):
    """
    Sentiment Analysis Work.

    Classifies each input text as ``negative``, ``neutral`` or ``positive``
    with a ``BertForSentimentAnalysis`` model and a ``BertTokenizer``, both
    built from resource files stored under the work directory.
    """

    # File name expected under ``<work_path>/<resource id>/`` for each resource.
    # These names must match the last path component of the download URLs below.
    resource_files_names = {
        "model_state": "bert_for_senta_model_state.ckpt",
        "vocab": "bert_for_senta_vocab.txt",
    }
    # Per model name: resource id -> [download URL, checksum].
    # NOTE(review): the 32-hex-digit checksum is presumably an MD5 digest —
    # confirm against how the base class verifies downloads.
    resource_files_urls = {
        "bert": {
            "vocab": [
                "https://download.mindspore.cn/toolkits/mindnlp/workflow/sentiment_analysis/bert_for_senta_vocab.txt",
                "3b5b76c4aef48ecf8cb3abaafe960f09",
            ],
            "model_state": [
                "https://download.mindspore.cn/toolkits/mindnlp/workflow/sentiment_analysis/bert_for_senta_model_state.ckpt",
                "7dba7b0371d2fcbb053e28c8bdfb1050",
            ],
        }
    }

    def __init__(self, work, model, **kwargs):
        """
        Set up the tokenizer and the model for this work.

        Args:
            work: Work identifier, forwarded to the base class.
            model: Model identifier, forwarded to the base class.
            **kwargs: Extra options forwarded to the base class. ``batch_size``
                (default 1) is read from them during preprocessing.
        """
        # NOTE(review): the base initializer is called with (model, work), i.e.
        # the reverse of this signature — confirm it matches Work.__init__.
        super().__init__(model, work, **kwargs)
        # Class index produced by the model -> human readable label.
        self._label_map = {0: "negative", 1: "neutral", 2: "positive"}
        self._check_work_files()
        # The tokenizer must be built first: it stores ``vocab_size`` and
        # ``pad_token_id`` in ``self.kwargs``, which the model setup and the
        # batch padding read afterwards.
        self._construct_tokenizer(model)
        self._construct_model(model)
        self._usage = usage

    def _construct_model(self, model):
        """
        Construct the model.

        Builds a three-class ``BertForSentimentAnalysis`` network, loads its
        parameters from the checkpoint in the work directory and puts it in
        evaluation mode. Requires ``self.kwargs["vocab_size"]`` to be set.
        """
        vocab_size = self.kwargs["vocab_size"]
        num_classes = len(self._label_map) if hasattr(self, "_label_map") else 3

        config = BertConfig(vocab_size=vocab_size, num_labels=num_classes)
        model_instance = BertForSentimentAnalysis(config)

        # Reuse the declared resource name so the path cannot drift from the
        # file name used when checking/downloading resources.
        model_path = os.path.join(
            self._work_path, "model_state", self.resource_files_names["model_state"]
        )
        state_dict = mindspore.load_checkpoint(model_path)
        mindspore.load_param_into_net(model_instance, state_dict)

        self._model = model_instance
        self._model.set_train(False)

    def _construct_tokenizer(self, model):
        """
        Construct the tokenizer.

        Loads the vocabulary file from the work directory, records
        ``vocab_size`` and ``pad_token_id`` in ``self.kwargs`` and creates the
        ``BertTokenizer`` used by preprocessing.
        """
        vocab_path = os.path.join(
            self._work_path, "vocab", self.resource_files_names["vocab"]
        )
        vocab = text.Vocab.from_file(vocab_path)

        self.kwargs["pad_token_id"] = vocab.tokens_to_ids("[PAD]")
        self.kwargs["vocab_size"] = len(vocab.vocab())
        self._tokenizer = BertTokenizer(vocab)

    def _preprocess(self, inputs, padding=True, add_special_tokens=True):
        """
        Preprocess the inputs.

        Args:
            inputs: Iterable of candidate texts. Items that are not non-empty
                strings are silently skipped.
            padding: Currently unused.
            add_special_tokens: Currently unused.

        Returns:
            dict with ``"text"`` (the texts that were kept, in order) and
            ``"data_loader"`` (a list of batches, each a list of
            ``(token_ids, length)`` tuples of at most ``batch_size`` items).
        """
        batch_size = self.kwargs.get("batch_size", 1)

        examples = []
        filter_inputs = []
        for input_data in inputs:
            if not isinstance(input_data, str) or not input_data:
                continue
            filter_inputs.append(input_data)
            ids = self._tokenizer.execute_py(input_data)
            examples.append((ids, len(ids)))

        batches = [
            examples[idx : idx + batch_size]
            for idx in range(0, len(examples), batch_size)
        ]
        return {"text": filter_inputs, "data_loader": batches}

    def _batchify_fn(self, samples):
        """
        Pad one batch to a common length and stack it into a Tensor.

        Args:
            samples: Non-empty list of ``(token_ids, length)`` tuples as
                produced by ``_preprocess``.

        Returns:
            Tensor holding every sample padded with ``pad_token_id`` up to the
            longest length in the batch.
        """
        max_length = max(sample[1] for sample in samples)
        pader = PadTransform(
            max_length=max_length, pad_value=self.kwargs["pad_token_id"]
        )
        return Tensor([pader(sample[0]) for sample in samples])

    def _run_model(self, inputs):
        """
        Run the model.

        Args:
            inputs: dict returned by ``_preprocess``.

        Returns:
            The same dict, extended with ``"result"`` (predicted label per
            text) and ``"score"`` (probability of the predicted label).
        """
        results = []
        scores = []
        for batch in inputs["data_loader"]:
            ids = self._batchify_fn(batch)
            outputs = self._model(ids)
            probs = F.softmax(outputs, axis=-1)
            idx = argmax(probs, dim=-1).asnumpy().tolist()
            if isinstance(idx, int):
                # A 0-d result converts to a bare int; normalise to a list.
                idx = [idx]
            # Convert the whole batch to NumPy once instead of once per row.
            score = probs.asnumpy().max(axis=-1).tolist()
            results.extend(self._label_map[i] for i in idx)
            scores.extend(score)

        inputs["result"] = results
        inputs["score"] = scores
        return inputs

    def _postprocess(self, inputs):
        """
        Postprocess the outputs.

        Args:
            inputs: dict returned by ``_run_model``.

        Returns:
            List with one ``{"text", "label", "score"}`` dict per kept input.
        """
        return [
            {"text": _text, "label": label, "score": score}
            for _text, label, score in zip(
                inputs["text"], inputs["result"], inputs["score"]
            )
        ]