# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| """Evaluation metric for the TimeBench temporal reasoning benchmark.""" | |
| import re | |
| from datetime import datetime | |
| import datasets | |
| import evaluate | |
| from dateutil import parser | |
| from dateutil.parser import ParserError | |
| _CITATION = """\ | |
| @software{abbood2026timebench_eval, | |
| title={TimeBench Eval}, | |
| author={Abbood, Auss}, | |
| year={2026}, | |
| url={https://huggingface.co/spaces/aauss/timebench_eval} | |
| } | |
| """ | |
| _DESCRIPTION = """\ | |
| Evaluation metric for the TimeBench benchmark, which assesses temporal reasoning | |
| abilities in large language models. Supports multiple task types including TempReason, | |
| TimeQA, MenatQA, Date Arithmetic, and TimeDial. | |
| """ | |
| _KWARGS_DESCRIPTION = """ | |
| Calculates evaluation metrics for temporal reasoning tasks. | |
| Args: | |
| predictions: list of prediction strings from the model. Each prediction | |
| should contain the marker "Thus, the correct answer is:" followed by the answer. | |
| references: list of reference answer strings. | |
| task: the task type, one of "TempReason", "TimeQA", "MenatQA", "Date Arithmetic", or "TimeDial". | |
| Returns: | |
| exact_match: list of exact match scores (0 or 1) for each prediction. | |
| f1: list of F1 scores for each prediction (for applicable tasks). | |
| Examples: | |
| >>> timebench_eval = evaluate.load("aauss/timebench_eval") | |
| >>> predictions = ["Let me think... Thus, the correct answer is: Aug, 1987."] | |
| >>> references = ["Aug, 1987"] | |
| >>> results = timebench_eval.compute(predictions=predictions, references=references, task="Date Arithmetic") | |
| >>> print(results) | |
| {'exact_match': [1]} | |
| """ | |


class TimebenchEval(evaluate.Metric):
    """Evaluation metric for TimeBench temporal reasoning tasks."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Load the bundled SQuAD metric once and reuse it for the EM/F1 tasks.
        self.squad_metric = evaluate.load("squad")

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            homepage="https://huggingface.co/spaces/aauss/timebench_eval",
            codebase_urls=["https://huggingface.co/spaces/aauss/timebench_eval/tree/main"],
            reference_urls=["https://huggingface.co/datasets/ulab-ai/Time-Bench"],
        )

    def _compute(
        self, predictions: list[str], references: list[str], task: str
    ) -> dict[str, list[float]]:
        """
        Compute evaluation metrics for the given predictions and references.
        Args:
            predictions: List of prediction strings to evaluate.
            references: List of reference strings to compare against.
            task: Task type, one of: "TempReason", "TimeQA", "MenatQA", "Date Arithmetic", "TimeDial".
        Returns:
            Dictionary containing metric scores (exact_match and/or f1) as lists of floats.
        """
        if task in ["TempReason", "TimeQA", "MenatQA"]:
            return self._call_squad(predictions, references)
        elif task == "Date Arithmetic":
            return self._compare_dates(predictions, references)
        elif task == "TimeDial":
            return self._compute_timedial(predictions, references)
        else:
            raise ValueError(
                f"Unknown task: {task}. "
                "Expected one of: TempReason, TimeQA, MenatQA, Date Arithmetic, TimeDial"
            )

    @staticmethod
    def _extract_answer(response: str) -> str | None:
        """Extract the answer that follows the expected marker, or None if absent."""
        marker = "Thus, the correct answer is:"
        if marker not in response:
            return None
        answer = response.split(marker)[-1]
        # Keep only the first line in case the model keeps generating after the answer.
        answer = answer.strip().split("\n")[0]
        # Drop trailing punctuation left over from a full-sentence answer.
        answer = answer.rstrip(".!?").strip()
        if "unanswerable" in answer.lower():
            return "unanswerable"
        return answer or None
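
    # Illustrative behaviour of `_extract_answer` (a sketch, not executed here):
    #   "Reasoning... Thus, the correct answer is: Aug, 1987."  -> "Aug, 1987"
    #   "Thus, the correct answer is: unanswerable."            -> "unanswerable"
    #   "A response without the marker"                         -> None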

    def _extract_selected_options(self, text: str) -> set[str]:
        """
        Extract selected option letters (A, B, C, D) from various formats:
        - "B, C"
        - "B and C"
        - "B & C"
        - "B && C"
        - "B. No more than ten minutes && C. No more than five minutes"
        - "Options B and C"
        - "The answer is B, C"
        """
        if not text:
            return set()
        # Match an option letter at a word boundary, followed by a period,
        # comma, whitespace, "&", or the end of the string. The word boundary
        # keeps letters inside words like "CAD" or "BAD" from matching.
        pattern = r"\b([A-D])(?:\.|,|\s|&|$)"
        matches = re.findall(pattern, text)
        return set(matches)
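
    # Illustrative behaviour of `_extract_selected_options` (a sketch):
    #   "B, C"                                  -> {"B", "C"}
    #   "Options B and C"                       -> {"B", "C"}
    #   "B. No more than ten minutes && C. No more than five minutes"
    #                                           -> {"B", "C"}
    #   "BAD idea"                              -> set()  (letters inside words are ignored)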

    def _call_squad(
        self, predictions: list[str], references: list[str]
    ) -> dict[str, list[float]]:
        """
        Compute SQuAD metrics (Exact Match and F1) for predictions and references.
        Args:
            predictions: List of prediction strings.
            references: List of reference answer strings.
        Returns:
            Dictionary with "exact_match" and "f1" keys, each containing a list of scores.
        """
        exact_matches = []
        f1_scores = []
        for pred, ref in zip(predictions, references):
            formatted_pred = [
                {"id": "0", "prediction_text": self._extract_answer(pred) or ""}
            ]
            formatted_ref = [
                {"id": "0", "answers": {"text": [ref], "answer_start": [0]}}
            ]
            results = self.squad_metric.compute(
                predictions=formatted_pred, references=formatted_ref
            )
            # The SQuAD metric reports scores on a 0-100 scale; rescale to 0-1.
            exact_matches.append(results["exact_match"] / 100)
            f1_scores.append(results["f1"] / 100)
        return {
            "exact_match": exact_matches,
            "f1": f1_scores,
        }
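
    # Example (a sketch): a single TimeQA-style pair such as
    #   pred = "Reasoning... Thus, the correct answer is: Paris."
    #   ref  = "Paris"
    # yields {'exact_match': [1.0], 'f1': [1.0]} after the 0-1 rescaling.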

    def _compare_dates(
        self, predictions: list[str], references: list[str]
    ) -> dict[str, list[int]]:
        """
        Parse and compare dates in predictions and references for exact match.
        Args:
            predictions: List of prediction strings containing dates.
            references: List of reference date strings.
        Returns:
            Dictionary with "exact_match" key containing a list of 0/1 scores.
        """
        pred_dates = [
            self._parse_historical_date(self._extract_answer(pred))
            for pred in predictions
        ]
        ref_dates = [self._parse_historical_date(ref) for ref in references]
        return {
            "exact_match": [
                1 if pred == ref else 0 for pred, ref in zip(pred_dates, ref_dates)
            ],
        }
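
    # Example (a sketch): both sides are normalised to the first of the month
    # before comparison, so differently formatted month answers compare equal:
    #   _parse_historical_date("Aug, 1987")   -> datetime(1987, 8, 1, 0, 0)
    #   _parse_historical_date("August 1987") -> datetime(1987, 8, 1, 0, 0)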

    def _compute_timedial(
        self, predictions: list[str], references: list[str]
    ) -> dict[str, list[float]]:
        """
        Compute TimeDial metrics (Exact Match and F1) using set-based comparison of selected options.
        Args:
            predictions: List of prediction strings.
            references: List of reference strings containing selected options.
        Returns:
            Dictionary with "exact_match" and "f1" keys, each containing a list of scores.
        """
        exact_matches = []
        f1_scores = []
        for pred, ref in zip(predictions, references):
            pred_answer = self._extract_answer(pred)  # Text after the marker
            pred_options = (
                self._extract_selected_options(pred_answer) if pred_answer else set()
            )
            ref_options = self._extract_selected_options(ref)
            # Exact match: the option sets must be identical.
            exact_matches.append(1 if pred_options == ref_options else 0)
            # F1 over the option sets.
            if not pred_options and not ref_options:
                f1 = 1.0  # Both empty counts as a perfect match
            elif not pred_options or not ref_options:
                f1 = 0.0  # One empty, one not
            else:
                tp = len(pred_options & ref_options)
                precision = tp / len(pred_options)
                recall = tp / len(ref_options)
                f1 = (
                    2 * precision * recall / (precision + recall)
                    if (precision + recall) > 0
                    else 0.0
                )
            f1_scores.append(f1)
        return {"exact_match": exact_matches, "f1": f1_scores}

    @staticmethod
    def _parse_historical_date(date_str: str | None) -> datetime | None:
        """
        Parse a date string and return a datetime object with the day set to 1.
        Args:
            date_str: String representation of a date, or None.
        Returns:
            datetime object with the day set to 1, or None if parsing fails or input is None.
        """
        if date_str is None:
            return None
        try:
            # Normalise to the first of the month so month-level answers compare equal.
            return parser.parse(date_str).replace(day=1)
        except (ParserError, OverflowError):
            return None
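

if __name__ == "__main__":
    # Minimal smoke test (a sketch): assumes the `evaluate` package can fetch
    # the bundled SQuAD metric on first use and that this script is run directly.
    metric = TimebenchEval()
    predictions = ["Let me think... Thus, the correct answer is: Aug, 1987."]
    references = ["Aug, 1987"]
    results = metric.compute(
        predictions=predictions, references=references, task="Date Arithmetic"
    )
    print(results)  # Expected: {'exact_match': [1]}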