Spaces:
Running
Running
| # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """Accuracy metric for the Test of Time benchmark by Bahar et al. (2025).""" | |
| import ast | |
| import json | |
| from typing import Literal | |
| import datasets | |
| import evaluate | |
| _CITATION = """\ | |
| @InProceedings{huggingface:module, | |
| title = {Test of Time Accuracy}, | |
| authors={Auss Abbood}, | |
| year={2025} | |
| } | |
| """ | |
| _DESCRIPTION = """\ | |
| The Test of Time (ToT) benchmarks expects models format their answers as a JSON with an explanation field and an answer field that follows a predefined format. The metrics extracts JSONs objects from the model's output, retains only the first JSON, drops the explanation field and compares it with the reference answer. | |
| """ | |
| # TODO: Add description of the arguments of the module here | |
| _KWARGS_DESCRIPTION = """ | |
| Compares the extracted answer from the model's output with the reference answer. | |
| Args: | |
| predictions: list of predictions to score. Each predictions | |
| should be a string with tokens separated by spaces. | |
| references: list of reference for each prediction. Each | |
| reference should be a string with tokens separated by spaces. | |
| Returns: | |
| accuracy: description of the first score, | |
| another_score: description of the second score, | |
| Examples: | |
| Examples should be written in doctest format, and should illustrate how | |
| to use the function. | |
| >>> my_new_module = evaluate.load("my_new_module") | |
| >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1]) | |
| >>> print(results) | |
| {'accuracy': 1.0} | |
| """ | |
class TestOfTimeAccuracy(evaluate.Metric):
    """Accuracy metric for the Test of Time benchmark by Bahar et al. (2025).

    Each prediction is scanned for its first JSON object. Depending on the
    subset, either the object's ``answer`` value ("semantic") or the object
    without its ``explanation`` key ("arithmetic") is compared for exact
    equality with the reference.
    """

    # The class name starts with "Test"; `__test__ = False` tells pytest not
    # to collect it as a test class.
    __test__ = False

    def _info(self):
        """Return the `evaluate.MetricInfo` describing this metric and its inputs."""
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
        )

    @staticmethod
    def _extract_first_json_object(s: str) -> dict | None:
        """Return the first JSON object (dict) that can be decoded from ``s``.

        The string is scanned left to right. Positions where no JSON value
        starts are skipped one character at a time; successfully decoded
        values that are not dicts (numbers, strings, lists, ...) are skipped
        as a whole. Returns ``None`` when no JSON object is found.
        """
        decoder = json.JSONDecoder()
        idx, end = 0, len(s)
        while idx < end:
            try:
                obj, idx = decoder.raw_decode(s, idx)
            except ValueError:
                # No JSON value starts here; try the next character.
                idx += 1
                continue
            if isinstance(obj, dict):
                return obj
        return None

    @staticmethod
    def _pop_explanation(d):
        """Remove the ``explanation`` key from ``d`` in place (if a dict) and return ``d``."""
        if isinstance(d, dict):
            d.pop("explanation", None)
        return d

    @staticmethod
    def _get_answer(d):
        """Return ``d["answer"]`` (``None`` if missing) for a dict; any other value unchanged."""
        if isinstance(d, dict):
            return d.get("answer", None)
        return d

    @staticmethod
    def _parse_label(s):
        """Parse a string that could be a JSON object or a Python dict.

        Returns the parsed value, or ``None`` when ``s`` is neither valid JSON
        nor a valid Python literal.
        """
        try:
            return json.loads(s)
        except json.JSONDecodeError:
            pass
        try:
            # Safe: only parses literals, does not execute code
            return ast.literal_eval(s)
        except (ValueError, SyntaxError):
            return None

    def _compute(
        self,
        predictions,
        references,
        subset: Literal["arithmetic", "semantic"],
        return_average: bool = True,
    ):
        """Score predictions against references by exact match.

        Args:
            predictions: model output strings; the first JSON object in each
                one is used.
            references: reference strings. For "arithmetic" they are parsed
                with `_parse_label`; for "semantic" they are compared as-is.
            subset: "semantic" compares the prediction's ``answer`` value,
                "arithmetic" compares the JSON object without ``explanation``.
            return_average: return the mean accuracy if True, otherwise the
                list of per-example booleans.

        Returns:
            ``{"accuracy": float}`` or ``{"accuracy": list[bool]}``.

        Raises:
            ValueError: if ``subset`` is not "arithmetic" or "semantic".
        """
        # Validate up front so an invalid subset fails before any parsing work.
        if subset not in ("arithmetic", "semantic"):
            raise ValueError(f"Invalid subset: {subset}")

        extracted = [self._extract_first_json_object(p) for p in predictions]
        if subset == "semantic":
            predictions = [self._get_answer(p) for p in extracted]
        else:
            predictions = [self._pop_explanation(p) for p in extracted]
            references = [self._parse_label(r) for r in references]

        # A prediction that yielded no value (None) must never count as
        # correct, even when the reference also failed to parse to None.
        matches = [
            pred is not None and pred == ref
            for pred, ref in zip(predictions, references)
        ]
        if not return_average:
            return {"accuracy": matches}
        if not matches:
            # Empty input: report 0.0 instead of dividing by zero.
            return {"accuracy": 0.0}
        return {"accuracy": sum(matches) / len(matches)}