"""Dataset utilities for saving and loading test results.""" from datetime import datetime from datasets import Dataset, load_dataset from huggingface_hub import HfApi from utils.model_interface import extract_model_id, get_model_info def get_username_from_token(token: str | None) -> str: """ Get username from Hugging Face token using whoami. Args: token: HF token string or None Returns: Username string, or "yjernite" as fallback if token is None or whoami fails """ if token is None: return "yjernite" try: api = HfApi() user_info = api.whoami(token=token) return user_info.get("name", "yjernite") except Exception: return "yjernite" def get_dataset_repo_id(token: str | None) -> str: """ Get dataset repository ID for the current user. Args: token: HF token string or None Returns: Dataset repo ID in format "{username}/moderation-test-results" """ username = get_username_from_token(token) return f"{username}/moderation-test-results" def get_roost_dataset_repo_id() -> str: """Get ROOST org dataset repository ID.""" return "roosttools/moderation-test-results" def load_dataset_from_hub(repo_id: str, token: str | None) -> tuple[list[dict], Exception | None]: """ Load dataset from Hub and return list of examples. Args: repo_id: Dataset repository ID token: HF token string or None (None allows public dataset access) Returns: Tuple of (list of example dicts, error Exception or None if successful) """ try: # Use load_dataset - more standard way to load from Hub dataset_dict = load_dataset(repo_id, token=token) # Get the default split (usually 'train' or first split) dataset = dataset_dict[list(dataset_dict.keys())[0]] # Convert to list of dicts examples = dataset.to_list() return examples, None except FileNotFoundError: # Dataset doesn't exist yet return [], None except Exception as e: # Other errors (network, auth, etc.) - return error return [], e def format_categories_and_reasoning(parsed: dict) -> str: """ Format categories and reasoning from parsed JSON response. Args: parsed: Parsed JSON dict with 'categories' key Returns: Formatted markdown string """ categories = parsed.get("categories", []) if categories and len(categories) > 0: cat_text = "### Categories:\n\n" for cat in categories: category_name = cat.get('category', 'Unknown') reasoning_text = cat.get('reasoning', 'No reasoning provided') policy_source = cat.get('policy_source', '') cat_text += f"- **Category:** {category_name}\n" cat_text += f" - **Explanation:** {reasoning_text}\n" if policy_source: cat_text += f" - **Policy Source:** {policy_source}\n" cat_text += "\n\n" return cat_text else: return "*No categories found in response*\n\nThis output expects a valid JSON response, as specified for example in the default prompt.\n\nThe raw response can be seen in the Model Response section below." def save_to_dataset(repo_id: str, token: str | None, data: dict) -> tuple[bool, str]: """ Save test result to Hugging Face dataset. Args: repo_id: Dataset repository ID (e.g., "username/moderation-test-results" or "roosttools/moderation-test-results") token: HF token string or None data: Dict with all test result fields Returns: Tuple of (success: bool, message: str) """ try: # Load existing dataset and examples using shared function examples, load_error = load_dataset_from_hub(repo_id, token) # If there was an error loading (other than FileNotFoundError), raise it if load_error is not None: raise load_error # Append new example examples.append(data) # Create new dataset with all examples dataset = Dataset.from_list(examples) # Push to hub (private by default) dataset.push_to_hub(repo_id, token=token, private=True) return True, f"Saved to {repo_id}" except FileNotFoundError: # Dataset doesn't exist yet, create new one try: dataset = Dataset.from_list([data]) dataset.push_to_hub(repo_id, token=token, private=True) return True, f"Saved to {repo_id}" except Exception as e: return False, f"Failed to create new dataset: {str(e)}" except Exception as e: return False, f"Failed to save: {str(e)}" def load_dataset_examples(repo_id: str, token: str | None) -> tuple[list[dict], list[str]]: """ Load examples from Hugging Face dataset. Args: repo_id: Dataset repository ID token: HF token string or None (None allows public dataset access) Returns: Tuple of (list of example dicts, list of formatted dropdown labels) """ # Use shared loading function examples, load_error = load_dataset_from_hub(repo_id, token) # If there was an error loading, return empty lists if load_error is not None: return [], [] if not examples: return [], [] # Format dropdown labels labels = [] for idx, example in enumerate(examples): input_text = example.get("input", "") model_selection = example.get("model_selection", "") policy_violation = example.get("policy_violation", -1) # Get label emoji if policy_violation == 1: label_emoji = "❌" elif policy_violation == 0: label_emoji = "✅" else: label_emoji = "⚠️" # Extract model name model_id = extract_model_id(model_selection) model_info = get_model_info(model_id) if model_id else None model_name = model_info.get("name", model_id) if model_info else model_id or "Unknown" # Truncate input for label input_preview = input_text[:40] + "..." if len(input_text) > 40 else input_text label = f"{input_preview} - {label_emoji} - {model_name} - #{idx}" labels.append(label) return examples, labels