Yacine Jernite
committed on
Commit
·
06103c4
1
Parent(s):
338aa78
load dataset
Browse files- README.md +3 -0
- app.py +126 -1
- requirements.txt +1 -0
- ui/tab_policy.py +11 -0
- ui/tab_testing.py +6 -0
README.md
CHANGED
|
@@ -11,6 +11,9 @@ license: apache-2.0
|
|
| 11 |
short_description: A model to test different models assessing content policies
|
| 12 |
hf_oauth: true
|
| 13 |
hf_oauth_scopes:
|
|
|
|
|
|
|
|
|
|
| 14 |
- inference-api
|
| 15 |
---
|
| 16 |
|
|
|
|
| 11 |
short_description: A model to test different models assessing content policies
|
| 12 |
hf_oauth: true
|
| 13 |
hf_oauth_scopes:
|
| 14 |
+
- read-repos
|
| 15 |
+
- write-repos
|
| 16 |
+
- manage-repos
|
| 17 |
- inference-api
|
| 18 |
---
|
| 19 |
|
app.py
CHANGED
|
@@ -7,10 +7,14 @@ import gradio as gr
|
|
| 7 |
|
| 8 |
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 9 |
|
|
|
|
|
|
|
|
|
|
| 10 |
from utils.helpers import get_hf_token
|
| 11 |
from utils.model_interface import extract_model_id, run_test
|
| 12 |
from ui.sidebar import build_sidebar
|
| 13 |
from ui.tab_config import build_config_tab
|
|
|
|
| 14 |
from ui.tab_policy import build_policy_tab
|
| 15 |
from ui.tab_testing import (
|
| 16 |
build_testing_tab,
|
|
@@ -24,7 +28,7 @@ from ui.tab_testing import (
|
|
| 24 |
# Handlers
|
| 25 |
# ============================================================================
|
| 26 |
|
| 27 |
-
def handle_run_test(test_input, current_policy, model_choice, reasoning_effort, max_tokens, temperature, top_p, system_prompt_val, response_format_val, oauth_token: gr.OAuthToken | None = None):
|
| 28 |
"""Handle test execution."""
|
| 29 |
|
| 30 |
if not test_input or not test_input.strip():
|
|
@@ -60,6 +64,33 @@ def handle_run_test(test_input, current_policy, model_choice, reasoning_effort,
|
|
| 60 |
model_info = format_model_info(model_choice, reasoning_effort)
|
| 61 |
reasoning_info_text, reasoning_info_visible = format_reasoning_info(model_choice, reasoning)
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
return (
|
| 64 |
model_info,
|
| 65 |
label_text,
|
|
@@ -92,6 +123,7 @@ with gr.Blocks(title="Moderation Model Testing") as demo:
|
|
| 92 |
testing_components = build_testing_tab()
|
| 93 |
test_input = testing_components["test_input"]
|
| 94 |
run_test_btn = testing_components["run_test_btn"]
|
|
|
|
| 95 |
model_info_display = testing_components["model_info_display"]
|
| 96 |
label_display = testing_components["label_display"]
|
| 97 |
categories_display = testing_components["categories_display"]
|
|
@@ -111,6 +143,11 @@ with gr.Blocks(title="Moderation Model Testing") as demo:
|
|
| 111 |
system_prompt_textbox = config_components["system_prompt_textbox"]
|
| 112 |
response_format_textbox = config_components["response_format_textbox"]
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
# ============================================================================
|
| 115 |
# Event Handlers
|
| 116 |
# ============================================================================
|
|
@@ -128,6 +165,7 @@ with gr.Blocks(title="Moderation Model Testing") as demo:
|
|
| 128 |
top_p,
|
| 129 |
system_prompt_textbox,
|
| 130 |
response_format_textbox,
|
|
|
|
| 131 |
],
|
| 132 |
outputs=[
|
| 133 |
model_info_display,
|
|
@@ -150,6 +188,93 @@ with gr.Blocks(title="Moderation Model Testing") as demo:
|
|
| 150 |
inputs=[model_dropdown, reasoning_effort],
|
| 151 |
outputs=model_info_display,
|
| 152 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
|
| 155 |
if __name__ == "__main__":
|
|
|
|
| 7 |
|
| 8 |
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 9 |
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
|
| 12 |
+
from utils.dataset import format_categories_and_reasoning, save_to_dataset
|
| 13 |
from utils.helpers import get_hf_token
|
| 14 |
from utils.model_interface import extract_model_id, run_test
|
| 15 |
from ui.sidebar import build_sidebar
|
| 16 |
from ui.tab_config import build_config_tab
|
| 17 |
+
from ui.tab_dataset import build_dataset_tab
|
| 18 |
from ui.tab_policy import build_policy_tab
|
| 19 |
from ui.tab_testing import (
|
| 20 |
build_testing_tab,
|
|
|
|
| 28 |
# Handlers
|
| 29 |
# ============================================================================
|
| 30 |
|
| 31 |
+
def handle_run_test(test_input, current_policy, model_choice, reasoning_effort, max_tokens, temperature, top_p, system_prompt_val, response_format_val, save_mode, oauth_token: gr.OAuthToken | None = None):
|
| 32 |
"""Handle test execution."""
|
| 33 |
|
| 34 |
if not test_input or not test_input.strip():
|
|
|
|
| 64 |
model_info = format_model_info(model_choice, reasoning_effort)
|
| 65 |
reasoning_info_text, reasoning_info_visible = format_reasoning_info(model_choice, reasoning)
|
| 66 |
|
| 67 |
+
# Save to dataset if enabled
|
| 68 |
+
if save_mode == "Save to Dataset" and hf_token is not None:
|
| 69 |
+
try:
|
| 70 |
+
categories_and_reasoning_text = format_categories_and_reasoning(parsed)
|
| 71 |
+
policy_violation = parsed.get("label", -1)
|
| 72 |
+
|
| 73 |
+
data = {
|
| 74 |
+
"input": test_input,
|
| 75 |
+
"policy_violation": policy_violation,
|
| 76 |
+
"categories_and_reasoning": categories_and_reasoning_text,
|
| 77 |
+
"policy": current_policy,
|
| 78 |
+
"model_selection": model_choice,
|
| 79 |
+
"raw_response": raw_response,
|
| 80 |
+
"reasoning_trace": reasoning or "",
|
| 81 |
+
"reasoning_effort": reasoning_effort or "",
|
| 82 |
+
"max_tokens": int(max_tokens),
|
| 83 |
+
"temperature": float(temperature),
|
| 84 |
+
"top_p": float(top_p),
|
| 85 |
+
"system_prompt": system_prompt_val or "",
|
| 86 |
+
"response_format": response_format_val or "",
|
| 87 |
+
"timestamp": datetime.now().isoformat(),
|
| 88 |
+
}
|
| 89 |
+
save_to_dataset(hf_token, data)
|
| 90 |
+
except Exception as e:
|
| 91 |
+
# Log error but don't break test execution
|
| 92 |
+
print(f"Failed to save to dataset: {e}")
|
| 93 |
+
|
| 94 |
return (
|
| 95 |
model_info,
|
| 96 |
label_text,
|
|
|
|
| 123 |
testing_components = build_testing_tab()
|
| 124 |
test_input = testing_components["test_input"]
|
| 125 |
run_test_btn = testing_components["run_test_btn"]
|
| 126 |
+
save_mode = testing_components["save_mode"]
|
| 127 |
model_info_display = testing_components["model_info_display"]
|
| 128 |
label_display = testing_components["label_display"]
|
| 129 |
categories_display = testing_components["categories_display"]
|
|
|
|
| 143 |
system_prompt_textbox = config_components["system_prompt_textbox"]
|
| 144 |
response_format_textbox = config_components["response_format_textbox"]
|
| 145 |
|
| 146 |
+
dataset_components = build_dataset_tab()
|
| 147 |
+
example_dropdown = dataset_components["example_dropdown"]
|
| 148 |
+
cached_examples = dataset_components["cached_examples"]
|
| 149 |
+
dropdown_choices_state = dataset_components["dropdown_choices_state"]
|
| 150 |
+
|
| 151 |
# ============================================================================
|
| 152 |
# Event Handlers
|
| 153 |
# ============================================================================
|
|
|
|
| 165 |
top_p,
|
| 166 |
system_prompt_textbox,
|
| 167 |
response_format_textbox,
|
| 168 |
+
save_mode,
|
| 169 |
],
|
| 170 |
outputs=[
|
| 171 |
model_info_display,
|
|
|
|
| 188 |
inputs=[model_dropdown, reasoning_effort],
|
| 189 |
outputs=model_info_display,
|
| 190 |
)
|
| 191 |
+
|
| 192 |
+
# Dataset load handler
|
| 193 |
+
def load_example_from_dataset(selected_label, cached_examples_list, dropdown_choices_list):
    """Load a cached dataset example and populate the input, config and result fields.

    Args:
        selected_label: Label currently selected in the dataset example dropdown.
        cached_examples_list: Cached example records (dict-like); the record for a
            label is looked up at the label's position in ``dropdown_choices_list``.
        dropdown_choices_list: Labels currently offered by the dropdown.

    Returns:
        A 15-tuple in the order of the ``outputs`` wired to the dropdown change
        event: test input, policy text, model selection, reasoning effort,
        max tokens, temperature, top-p, system prompt, response format,
        model info, label text, categories/reasoning text, raw response, and
        two ``gr.update`` objects (reasoning info, reasoning trace) carrying
        value and visibility. When the selection cannot be resolved to a cached
        example, every position is an empty ``gr.update()`` so no component
        is modified.
    """
    output_count = 15

    def skip_all():
        # An empty gr.update() leaves a component untouched. Returning a bare
        # None would instead be applied as the new value and wipe the current
        # inputs, the policy state and the displayed results.
        return tuple(gr.update() for _ in range(output_count))

    if not (cached_examples_list and selected_label and dropdown_choices_list):
        return skip_all()

    # Resolve the label to its position; a stale label is simply ignored.
    try:
        idx = dropdown_choices_list.index(selected_label)
    except ValueError:
        return skip_all()

    # The choices list may be longer than the cache if the two got out of sync.
    if idx >= len(cached_examples_list):
        return skip_all()

    example = cached_examples_list[idx]

    # Policy must be a string (never None) because it feeds the policy state.
    policy = example.get("policy", "") or ""

    # Saved results of the original run.
    policy_violation = example.get("policy_violation", -1)
    categories_and_reasoning = example.get("categories_and_reasoning", "")
    raw_response = example.get("raw_response", "")
    reasoning_trace = example.get("reasoning_trace", "")
    model_selection = example.get("model_selection", "")
    reasoning_effort_val = example.get("reasoning_effort", "")

    # Human-readable verdict; anything other than 0/1 is treated as unknown.
    if policy_violation == 1:
        label_text = "## ❌ Policy Violation Detected"
    elif policy_violation == 0:
        label_text = "## ✅ No Policy Violation"
    else:
        label_text = "## ⚠️ Unable to determine label"

    model_info = format_model_info(model_selection, reasoning_effort_val)
    reasoning_info_text, reasoning_info_visible = format_reasoning_info(model_selection, reasoning_trace)

    # Only show the reasoning panel when there is a non-blank trace to display.
    reasoning_visible = bool(reasoning_trace and reasoning_trace.strip())

    return (
        example.get("input", ""),
        policy,  # current_policy_state - UI syncs automatically via change handler
        model_selection,
        reasoning_effort_val,
        example.get("max_tokens", 0),
        example.get("temperature", 0.0),
        example.get("top_p", 0.0),
        example.get("system_prompt", ""),
        example.get("response_format", ""),
        # Results
        model_info,
        label_text,
        categories_and_reasoning,
        raw_response,
        gr.update(value=reasoning_info_text, visible=reasoning_info_visible),
        gr.update(value=reasoning_trace or "", visible=reasoning_visible),
    )
|
| 255 |
+
|
| 256 |
+
example_dropdown.change(
|
| 257 |
+
load_example_from_dataset,
|
| 258 |
+
inputs=[example_dropdown, cached_examples, dropdown_choices_state],
|
| 259 |
+
outputs=[
|
| 260 |
+
test_input,
|
| 261 |
+
current_policy_state, # UI components sync automatically via change handler
|
| 262 |
+
model_dropdown,
|
| 263 |
+
reasoning_effort,
|
| 264 |
+
max_tokens,
|
| 265 |
+
temperature,
|
| 266 |
+
top_p,
|
| 267 |
+
system_prompt_textbox,
|
| 268 |
+
response_format_textbox,
|
| 269 |
+
# Results
|
| 270 |
+
model_info_display,
|
| 271 |
+
label_display,
|
| 272 |
+
categories_display,
|
| 273 |
+
model_response_display,
|
| 274 |
+
reasoning_info,
|
| 275 |
+
reasoning_display,
|
| 276 |
+
],
|
| 277 |
+
)
|
| 278 |
|
| 279 |
|
| 280 |
if __name__ == "__main__":
|
requirements.txt
CHANGED
|
@@ -3,3 +3,4 @@ openai
|
|
| 3 |
gradio
|
| 4 |
python-dotenv
|
| 5 |
huggingface-hub
|
|
|
|
|
|
| 3 |
gradio
|
| 4 |
python-dotenv
|
| 5 |
huggingface-hub
|
| 6 |
+
datasets
|
ui/tab_policy.py
CHANGED
|
@@ -81,6 +81,17 @@ def build_policy_tab(base_dir: str) -> dict:
|
|
| 81 |
lambda: ("", "", "*No policy loaded*"),
|
| 82 |
outputs=[current_policy_state, manual_text, policy_preview],
|
| 83 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
return {
|
| 86 |
"current_policy_state": current_policy_state,
|
|
|
|
| 81 |
lambda: ("", "", "*No policy loaded*"),
|
| 82 |
outputs=[current_policy_state, manual_text, policy_preview],
|
| 83 |
)
|
| 84 |
+
|
| 85 |
+
# Sync UI components when state changes externally (e.g., from dataset load)
|
| 86 |
+
def sync_policy_ui(policy_text):
    """Mirror the policy state into the manual text box and the preview.

    Returns a ``(manual_text, preview)`` pair: the policy text is passed
    through unchanged for the text box, and the preview shows either that
    same text or a placeholder when the policy is empty/falsy.
    """
    if policy_text:
        return policy_text, policy_text
    return policy_text, "*No policy loaded*"
|
| 89 |
+
|
| 90 |
+
current_policy_state.change(
|
| 91 |
+
sync_policy_ui,
|
| 92 |
+
inputs=current_policy_state,
|
| 93 |
+
outputs=[manual_text, policy_preview],
|
| 94 |
+
)
|
| 95 |
|
| 96 |
return {
|
| 97 |
"current_policy_state": current_policy_state,
|
ui/tab_testing.py
CHANGED
|
@@ -136,6 +136,11 @@ def build_testing_tab() -> dict:
|
|
| 136 |
example_dropdown = gr.Dropdown(label="Load Example", choices=list(TEST_EXAMPLES.keys()))
|
| 137 |
load_example_btn = gr.Button("Load Example", variant="secondary")
|
| 138 |
run_test_btn = gr.Button("Run Test", variant="primary")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
# Initialize with default model info
|
| 140 |
initial_model = f"{MODELS[0]['name']} ({MODELS[0]['id']})"
|
| 141 |
initial_info_lines = [
|
|
@@ -170,6 +175,7 @@ def build_testing_tab() -> dict:
|
|
| 170 |
"example_dropdown": example_dropdown,
|
| 171 |
"load_example_btn": load_example_btn,
|
| 172 |
"run_test_btn": run_test_btn,
|
|
|
|
| 173 |
"model_info_display": model_info_display,
|
| 174 |
"label_display": label_display,
|
| 175 |
"categories_display": categories_display,
|
|
|
|
| 136 |
example_dropdown = gr.Dropdown(label="Load Example", choices=list(TEST_EXAMPLES.keys()))
|
| 137 |
load_example_btn = gr.Button("Load Example", variant="secondary")
|
| 138 |
run_test_btn = gr.Button("Run Test", variant="primary")
|
| 139 |
+
save_mode = gr.Radio(
|
| 140 |
+
label="Save to Dataset",
|
| 141 |
+
choices=["Don't Save", "Save to Dataset"],
|
| 142 |
+
value="Don't Save"
|
| 143 |
+
)
|
| 144 |
# Initialize with default model info
|
| 145 |
initial_model = f"{MODELS[0]['name']} ({MODELS[0]['id']})"
|
| 146 |
initial_info_lines = [
|
|
|
|
| 175 |
"example_dropdown": example_dropdown,
|
| 176 |
"load_example_btn": load_example_btn,
|
| 177 |
"run_test_btn": run_test_btn,
|
| 178 |
+
"save_mode": save_mode,
|
| 179 |
"model_info_display": model_info_display,
|
| 180 |
"label_display": label_display,
|
| 181 |
"categories_display": categories_display,
|