Source code for mindnlp.metrics.rouge

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Classes for Metrics RougeN and RougeL"""

import numpy as np
from mindnlp.abc import Metric
from .utils import _check_value_type


def _get_ngrams(words, n_size=1):
    """
    Collects the distinct n-grams of a token sequence.

    Args:
        words (list): The tokens to slide the n-gram window over.
        n_size (int): Number of tokens per n-gram. Default: 1.

    Returns:
        - **ngram_set** (set) - A set of tuples, one per distinct window of
          `n_size` consecutive tokens. Empty when `words` holds fewer than
          `n_size` tokens.
    """
    # Number of window start positions; zero or negative yields an empty range.
    window_count = len(words) - n_size + 1
    return {tuple(words[start:start + n_size]) for start in range(window_count)}

def _lcs(strg, sub):
    """
    Computes the length of the longest common subsequence of two sequences.

    Args:
        strg (list): The first token sequence, usually the longer one.
        sub (list): The second token sequence.

    Returns:
        - **length** (float) - The length of the longest common subsequence
          of `strg` and `sub`, read from a NumPy table (a NumPy float scalar).
    """
    # Put the longer sequence on the row axis, regardless of argument order.
    longer, shorter = (sub, strg) if len(strg) < len(sub) else (strg, sub)
    n_rows = len(longer) + 1
    n_cols = len(shorter) + 1

    # table[r, c] holds the LCS length of longer[:r] and shorter[:c];
    # row 0 and column 0 stay zero (empty prefix).
    table = np.zeros((n_rows, n_cols))
    for col in range(1, n_cols):
        for row in range(1, n_rows):
            if longer[row - 1] == shorter[col - 1]:
                table[row, col] = table[row - 1, col - 1] + 1
            else:
                table[row, col] = max(table[row - 1, col], table[row, col - 1])

    return table[n_rows - 1, n_cols - 1]


def rouge_n_fn(cand_list, ref_list, n_size=1):
    r"""
    Calculates the ROUGE-N score.

    ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of
    metrics used for evaluating automatic summarization and machine
    translation models. ROUGE-N measures the overlap of n-grams between a
    candidate and its reference summaries. N-grams are collected as sets, so
    a repeated n-gram within one sentence is counted once.

    Args:
        cand_list (list): A tokenized candidate sentence.
        ref_list (list): A list of tokenized reference sentences.
        n_size (int): N_gram value. Default: 1.

    Returns:
        - **rougen_score** (float) - Number of overlapping n-grams divided by
          the total number of reference n-grams.

    Raises:
        RuntimeError: If the references contain no n-grams of size `n_size`.

    Example:
        >>> from mindnlp.metrics.rouge import rouge_n_fn
        >>> cand_list = ["the", "cat", "was", "found", "under", "the", "bed"]
        >>> ref_list = [["the", "cat", "was", "under", "the", "bed"]]
        >>> rougen_score = rouge_n_fn(cand_list, ref_list, 2)
        >>> print(rougen_score)
        0.8

    """
    cand_list = _check_value_type("cand_list", cand_list, list)
    ref_list = _check_value_type("ref_list", ref_list, list)
    n_size = _check_value_type("n_size", n_size, [int])

    candidate_ngrams = _get_ngrams(cand_list, n_size)
    matched_total = 0
    reference_total = 0
    for ref_tokens in ref_list:
        reference_ngrams = _get_ngrams(ref_tokens, n_size)
        reference_total += len(reference_ngrams)
        # N-grams shared by the candidate and this reference.
        matched_total += len(candidate_ngrams & reference_ngrams)

    if reference_total == 0:
        raise RuntimeError(f'ROUGE-N can not be calculated, because the number of references is {0}')

    return matched_total / reference_total
def rouge_l_fn(cand_list, ref_list, beta=1.2):
    r"""
    Calculates the ROUGE-L score.

    ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of
    metrics used for evaluating automatic summarization and machine
    translation models. ROUGE-L is calculated based on Longest Common
    Subsequence (LCS). The function is shown as follows:

    .. math::

        R_{l c s}=\frac{L C S(X, Y)}{m}

        P_{l c s}=\frac{L C S(X, Y)}{n}

        F_{l c s}=\frac{\left(1+\beta^{2}\right) R_{l c s} P_{l c s}}{R_{l c s}+\beta^{2} P_{l c s}}

    where `X` is a reference sentence of length `m`, `Y` is the candidate
    sentence of length `n`, and `LCS` means the longest common subsequence.
    With several references, the largest precision and the largest recall
    over all references are combined into the F-score.

    Args:
        cand_list (list): A tokenized candidate sentence.
        ref_list (list): A list of tokenized reference sentences.
        beta (float): A hyperparameter to decide the weight of recall. Defaults: 1.2.

    Returns:
        - **rougel_score** (float) - The computed result. It is 0.0 when the
          best precision or the best recall is zero, which includes an empty
          candidate and references that are all empty.

    Raises:
        ValueError: If `ref_list` is empty.

    Example:
        >>> from mindnlp.metrics.rouge import rouge_l_fn
        >>> cand_list = ["The","cat","The","cat","on","the","mat"]
        >>> ref_list = [["The","cat","is","on","the","mat"], ["There","is","a","cat","on","the","mat"]]
        >>> rougel_score = rouge_l_fn(cand_list, ref_list)
        >>> print(rougel_score)
        0.7800511508951408

    """
    cand_list = _check_value_type("cand_list", cand_list, list)
    ref_list = _check_value_type("ref_list", ref_list, list)
    beta = _check_value_type("beta", beta, [float])

    # Without references there is nothing to compare against; fail with a
    # clear message instead of the opaque error from max() on an empty list.
    if not ref_list:
        raise ValueError('For `rouge_l_fn`, `ref_list` must contain at least one reference.')

    precs, recalls = [], []
    for ref in ref_list:
        basic_lcs = _lcs(cand_list, ref)
        # Guard on emptiness: dividing the NumPy LCS scalar by a zero length
        # yields NaN, which would make max() depend on reference order.
        precs.append(basic_lcs / len(cand_list) if cand_list else 0.)
        recalls.append(basic_lcs / len(ref) if ref else 0.)

    prec_max = max(precs)
    rec_max = max(recalls)

    if prec_max == 0 or rec_max == 0:
        return 0.0

    beta_sq = beta ** 2
    rougel_score = ((1 + beta_sq) * prec_max * rec_max) / float(rec_max + beta_sq * prec_max)
    return rougel_score
class RougeN(Metric):
    r"""
    Calculates the ROUGE-N.

    ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of
    metrics used for evaluating automatic summarization and machine
    translation models. ROUGE-N measures the overlap of n-grams between
    candidates and reference summaries. Counts are accumulated across calls
    to `update` until `clear` is called.

    Args:
        n_size (int): N_gram value. Default: 1.
        name (str): Name of the metric. Default: 'RougeN'.

    Example:
        >>> from mindnlp.metrics.rouge import RougeN
        >>> cand_list = ["the", "cat", "was", "found", "under", "the", "bed"]
        >>> ref_list = [["the", "cat", "was", "under", "the", "bed"]]
        >>> metric = RougeN(2)
        >>> metric.update(cand_list, ref_list)
        >>> rougen_score = metric.eval()
        >>> print(rougen_score)
        0.8

    """

    def __init__(self, n_size=1, name='RougeN'):
        super().__init__()
        self._name = name
        self.n_size = _check_value_type("n_size", n_size, [int])
        # Running totals over every `update` call since the last `clear`.
        self.overlap_count = 0
        self.ref_count = 0

    def clear(self):
        """Resets the accumulated overlap and reference n-gram counts to zero."""
        self.overlap_count = 0
        self.ref_count = 0

    def update(self, *inputs):
        """
        Adds the n-gram counts of one candidate and its references to the totals.

        Args:
            inputs: Input `cand_list` and `ref_list`.

                - cand_list (list): A tokenized candidate sentence.
                - ref_list (list): A list of tokenized ground truth sentences.

        Raises:
            ValueError: If the number of inputs is not 2.
        """
        received = len(inputs)
        if received != 2:
            raise ValueError(f'For `RougeN.update`, it needs 2 inputs (`cand_list` and `ref_list`),'
                             f' but got {len(inputs)}.')

        cand_list = _check_value_type("cand_list", inputs[0], list)
        ref_list = _check_value_type("ref_list", inputs[1], list)

        candidate_ngrams = _get_ngrams(cand_list, self.n_size)
        for ref_tokens in ref_list:
            reference_ngrams = _get_ngrams(ref_tokens, self.n_size)
            self.ref_count += len(reference_ngrams)
            # N-grams shared by the candidate and this reference.
            self.overlap_count += len(candidate_ngrams & reference_ngrams)

    def eval(self):
        """
        Computes and returns the Rouge-N score.

        Returns:
            - **rougen_score** (float) - Accumulated overlap count divided by
              the accumulated reference n-gram count.

        Raises:
            RuntimeError: If the accumulated reference n-gram count is 0.
        """
        if self.ref_count == 0:
            raise RuntimeError(f'ROUGE-N can not be calculated, because the number of references is {0}')

        return self.overlap_count / self.ref_count

    def get_metric_name(self):
        """
        Returns the name of the metric.
        """
        return self._name
class RougeL(Metric):
    r"""
    Calculates the ROUGE-L score.

    ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of
    metrics used for evaluating automatic summarization and machine
    translation models. ROUGE-L is calculated based on Longest Common
    Subsequence (LCS). The function is shown as follows:

    .. math::

        R_{l c s}=\frac{L C S(X, Y)}{m}

        P_{l c s}=\frac{L C S(X, Y)}{n}

        F_{l c s}=\frac{\left(1+\beta^{2}\right) R_{l c s} P_{l c s}}{R_{l c s}+\beta^{2} P_{l c s}}

    where `X` is a reference sentence of length `m`, `Y` is the candidate
    sentence of length `n`, and `LCS` means the longest common subsequence.
    Each `update` call stores one score; `eval` returns their mean.

    Args:
        beta (float): A hyperparameter to decide the weight of recall. Defaults: 1.2.
        name (str): Name of the metric. Default: 'RougeL'.

    Example:
        >>> from mindnlp.metrics.rouge import RougeL
        >>> cand_list = ["The","cat","The","cat","on","the","mat"]
        >>> ref_list = [["The","cat","is","on","the","mat"], ["There","is","a","cat","on","the","mat"]]
        >>> metric = RougeL()
        >>> metric.update(cand_list, ref_list)
        >>> rougel_score = metric.eval()
        >>> print(rougel_score)
        0.7800511508951408

    """

    def __init__(self, beta=1.2, name='RougeL'):
        super().__init__()
        self._name = name
        self.beta = _check_value_type("beta", beta, [float])
        # One F-score per `update` call since the last `clear`.
        self.inst_scores = []

    def clear(self):
        """Clears the internal evaluation results."""
        self.inst_scores = []

    def update(self, *inputs):
        """
        Scores one candidate against its references and stores the result.

        Args:
            inputs: Input `cand_list` and `ref_list`.

                - cand_list (list): A tokenized candidate sentence.
                - ref_list (list): A list of tokenized ground truth sentences.

        Raises:
            ValueError: If the number of inputs is not 2, or if `ref_list`
                is empty.
        """
        if len(inputs) != 2:
            raise ValueError(f'For `RougeL.update`, it needs 2 inputs (`cand_list` and `ref_list`),'
                             f' but got {len(inputs)}.')

        cand_list = _check_value_type("cand_list", inputs[0], list)
        ref_list = _check_value_type("ref_list", inputs[1], list)

        # Without references there is nothing to compare against; fail with a
        # clear message instead of the opaque error from max() on an empty list.
        if not ref_list:
            raise ValueError('For `RougeL.update`, `ref_list` must contain at least one reference.')

        precs, recalls = [], []
        for ref in ref_list:
            basic_lcs = _lcs(cand_list, ref)
            # Guard on emptiness: dividing the NumPy LCS scalar by a zero
            # length yields NaN, which would make max() depend on reference order.
            precs.append(basic_lcs / len(cand_list) if cand_list else 0.)
            recalls.append(basic_lcs / len(ref) if ref else 0.)

        prec_max = max(precs)
        rec_max = max(recalls)

        if prec_max != 0 and rec_max != 0:
            beta_sq = self.beta ** 2
            score = ((1 + beta_sq) * prec_max * rec_max) / float(rec_max + beta_sq * prec_max)
        else:
            score = 0.0

        self.inst_scores.append(score)

    def eval(self):
        """
        Computes and returns the Rouge-L score.

        Returns:
            - **rougel_score** (float) - The mean of the scores stored by
              `update` since the last `clear`.

        Raises:
            ZeroDivisionError: If no score has been stored yet.
        """
        rougel_score = 1. * sum(self.inst_scores) / len(self.inst_scores)
        return rougel_score

    def get_metric_name(self):
        """
        Returns the name of the metric.
        """
        return self._name
# Public API of this module: the functional and the stateful (class) forms
# of both metrics.
__all__ = [
    'rouge_n_fn',
    'rouge_l_fn',
    'RougeL',
    'RougeN',
]