# llm_moderation_testing/ui/tab_testing.py
"""Testing tab UI components."""
import os
import sys
import gradio as gr
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import json
from utils.constants import MODELS, TEST_EXAMPLES
from utils.model_interface import extract_model_id, get_model_info
from utils.helpers import check_token_availability
def parse_json_response(response: str) -> dict:
"""Parse JSON response, handling code blocks."""
response = response.strip()
try:
if "```json" in response:
response = response.split("```json")[1].split("```")[0]
elif "```" in response:
response = response.split("```")[1].split("```")[0]
return json.loads(response)
except json.JSONDecodeError:
return {"label": -1, "categories": []}
def format_model_info(model_choice, reasoning_effort) -> str:
"""Format model information markdown."""
if not model_choice:
return "*Select a model in Configuration tab*"
model_id = extract_model_id(model_choice)
if not model_id:
return "*Select a model in Configuration tab*"
model_info = get_model_info(model_id)
if not model_info:
return f"*Model: {model_id}*"
model_name = model_info.get("name", model_id)
is_thinking = model_info.get("is_thinking", False)
supports_reasoning_level = model_info.get("supports_reasoning_level", False)
    # Fall back to "Low" when reasoning_effort is unset or empty
reasoning_effort_val = reasoning_effort if reasoning_effort else "Low"
info_lines = [
f"**Model:** {model_name}",
f"- **Thinking Model:** {'Yes' if is_thinking else 'No'}",
f"- **Supports Reasoning Level:** {'Yes' if supports_reasoning_level else 'No'}",
]
if supports_reasoning_level:
info_lines.append(f"- **Reasoning Effort:** {reasoning_effort_val}")
return "\n".join(info_lines)
def format_reasoning_info(model_choice, reasoning_text) -> tuple[str, bool]:
"""Format reasoning info markdown and visibility."""
if not model_choice:
return "", False
model_id = extract_model_id(model_choice)
model_info = get_model_info(model_id)
if not model_info:
return "", False
is_thinking = model_info.get("is_thinking", False)
    # Non-thinking models never produce reasoning traces, so always show the note
    if not is_thinking:
        return "*This model does not provide reasoning traces.*", True
    # Thinking models surface their trace in the dedicated reasoning display;
    # no informational message is shown here regardless of reasoning_text
    return "", False
def format_save_mode_help(has_personal: bool, has_org: bool) -> str:
"""
Format help text explaining save mode options.
Args:
has_personal: Whether personal token is available
has_org: Whether org token is available
Returns:
Help text string
"""
lines = []
if not has_personal and not has_org:
lines.append("*⚠️ No tokens available. Please log in or set tokens to save results.*")
else:
if has_org:
lines.append("*✅ ROOST Dataset: Available (org token set)*")
else:
lines.append("*❌ ROOST Dataset: Requires org token (HACKATHON_INFERENCE_TOKEN)*")
if has_personal:
lines.append("*✅ Private Dataset: Available (personal token set)*")
else:
lines.append("*❌ Private Dataset: Requires personal token (OAuth login or .env)*")
return "\n".join(lines)
def format_test_result(result: dict) -> tuple[str, dict, str, str, str]:
"""
Format test result for display.
Returns:
Tuple of (label_text, parsed_json, categories_text, reasoning_text, raw_response)
"""
raw_content = result.get("content", "")
parsed = parse_json_response(raw_content)
label = parsed.get("label", -1)
categories = parsed.get("categories", [])
label_text = (
"## ❌ Policy Violation Detected" if label == 1
else "## ✅ No Policy Violation" if label == 0
else "## ⚠️ Unable to determine label"
)
    if categories:
cat_text = "### Categories:\n\n"
for cat in categories:
category_name = cat.get('category', 'Unknown')
reasoning_text = cat.get('reasoning', 'No reasoning provided')
policy_source = cat.get('policy_source', '')
cat_text += f"- **Category:** {category_name}\n"
cat_text += f" - **Explanation:** {reasoning_text}\n"
if policy_source:
cat_text += f" - **Policy Source:** {policy_source}\n"
cat_text += "\n\n"
else:
cat_text = "*No categories found in response*\n\n"
cat_text += "This output expects a valid JSON response, as specified for example in the default prompt.\n\n"
cat_text += "The raw response can be seen in the Model Response section below."
reasoning = result.get("reasoning", "")
# Format raw response for display
raw_response_text = f"```\n{raw_content}\n```"
return label_text, parsed, cat_text, reasoning or "", raw_response_text
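
# Sketch of how a raw result dict maps onto the returned tuple (the content
# string is a hypothetical moderation response, not captured model output):
#     result = {
#         "content": '{"label": 1, "categories": [{"category": "Hate Speech", '
#                    '"reasoning": "Targets a protected group.", "policy_source": "Policy 2.1"}]}',
#         "reasoning": "chain-of-thought text, if the model exposes one",
#     }
#     label_text, parsed, cat_text, reasoning, raw = format_test_result(result)
#     # label_text -> "## ❌ Policy Violation Detected"
#     # cat_text   -> bulleted Category / Explanation / Policy Source markdown
#     # raw        -> the content wrapped in a ``` code fence for display
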
def build_testing_tab() -> dict:
"""Build the testing tab UI and set up simple handlers."""
with gr.Tab("🧪 Testing"):
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### Input")
with gr.Group():
test_input = gr.Textbox(label="Test Content", placeholder="Enter content to test...", lines=5)
example_dropdown = gr.Dropdown(label="Load Example", choices=list(TEST_EXAMPLES.keys()), value=None)
load_example_btn = gr.Button("Load Example", variant="secondary")
run_test_btn = gr.Button("Run Test", variant="primary")
save_mode = gr.Radio(
label="Save to Dataset",
choices=["Don't Save", "Save to ROOST Dataset", "Save to Private Dataset"],
value="Don't Save"
)
# Initialize help text based on token availability
has_personal, has_org = check_token_availability(None)
save_mode_help = gr.Markdown(
value=format_save_mode_help(has_personal, has_org),
visible=True
)
                # Initialize the model info display with the default model, reusing
                # the same formatter the Configuration tab handlers call later
                initial_model = f"{MODELS[0]['name']} ({MODELS[0]['id']})"
                model_info_display = gr.Markdown(value=format_model_info(initial_model, "Low"))
with gr.Column(scale=2):
gr.Markdown("### Results")
label_display = gr.Markdown(value="*Run a test to see results*")
with gr.Accordion("Categories & Reasoning", open=True):
categories_display = gr.Markdown(value="*No categories yet*")
with gr.Accordion("Model Response", open=False):
model_response_display = gr.Markdown(value="*No response yet*")
with gr.Accordion("Reasoning Trace", open=False):
reasoning_info = gr.Markdown(value="", visible=False)
reasoning_display = gr.Code(label="", language=None, value="", visible=False)
# Simple handlers that don't need cross-tab coordination
load_example_btn.click(
lambda name: TEST_EXAMPLES.get(name, ""),
inputs=example_dropdown,
outputs=test_input,
)
return {
"test_input": test_input,
"example_dropdown": example_dropdown,
"load_example_btn": load_example_btn,
"run_test_btn": run_test_btn,
"save_mode": save_mode,
"save_mode_help": save_mode_help,
"model_info_display": model_info_display,
"label_display": label_display,
"categories_display": categories_display,
"model_response_display": model_response_display,
"reasoning_info": reasoning_info,
"reasoning_display": reasoning_display,
}
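

# Minimal standalone smoke test (a sketch: the real app builds this tab inside
# its own gr.Blocks context and wires the returned components, e.g.
# run_test_btn.click(...), to handlers that live outside this module).
if __name__ == "__main__":
    with gr.Blocks() as demo:
        components = build_testing_tab()  # dict of widgets for external wiring
    demo.launch()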