Talor Abramovich committed on
Commit
05c4cde
·
1 Parent(s): 78a366f

ablation bench space fin

Browse files
Files changed (3) hide show
  1. app.py +67 -98
  2. requirements.txt +1 -1
  3. style.css +40 -0
app.py CHANGED
@@ -34,14 +34,19 @@ def _sanitize_history(history):
34
  if isinstance(msg, gr.ChatMessage):
35
  role = msg.role
36
  content = msg.content
 
37
  elif isinstance(msg, dict):
38
  role = msg.get("role")
39
  content = msg.get("content", "")
 
40
  else:
41
  continue
42
  if role not in {"user", "assistant", "system"}:
43
  continue
44
- clean.append({"role": role, "content": _normalize_message_content(content)})
 
 
 
45
  return clean
46
 
47
 
@@ -163,27 +168,22 @@ def _archive_to_tagged_source(extracted_root: Path) -> str:
163
 
164
  def _convert_pdf_to_markdown(pdf_path: Path) -> str:
165
  try:
166
- from marker.converters.pdf import PdfConverter
167
- from marker.models import create_model_dict
168
- from marker.output import text_from_rendered
169
  except Exception as e:
170
  raise gr.Error(
171
- "Marker SDK is not available. Make sure `marker-pdf` is installed."
172
  ) from e
173
 
174
  try:
175
- converter = PdfConverter(artifact_dict=create_model_dict())
176
- rendered = converter(str(pdf_path))
177
- text, _, _ = text_from_rendered(rendered)
178
  except Exception as e:
179
- raise gr.Error(f"PDF conversion failed with Marker SDK: {e}") from e
180
 
181
  text = (text or "").strip()
182
  if not text:
183
- markdown_text = getattr(rendered, "markdown", "") if rendered is not None else ""
184
- text = (markdown_text or "").strip()
185
- if not text:
186
- raise gr.Error("Marker SDK produced empty output for this PDF.")
187
  return text
188
 
189
 
@@ -237,17 +237,6 @@ def _build_paper_source_from_upload(uploaded_path: str) -> str:
237
  "archives (.zip/.tar/.tar.gz/.tgz/.gz/.gzip), or .pdf."
238
  )
239
 
240
-
241
- def get_all_marker_models():
242
- try:
243
- from marker.models import create_model_dict
244
- create_model_dict()
245
- except Exception as e:
246
- raise gr.Error(
247
- "Marker SDK is not available. Make sure `marker-pdf` is installed."
248
- ) from e
249
-
250
-
251
  def run_single_interaction(
252
  message_input,
253
  history,
@@ -257,7 +246,6 @@ def run_single_interaction(
257
  top_p,
258
  model_id,
259
  provider_name,
260
- interaction_locked,
261
  hf_token: gr.OAuthToken,
262
  ):
263
  """
@@ -267,9 +255,6 @@ def run_single_interaction(
267
  config = yaml.safe_load(Path("./prompts.yaml").read_text())
268
  prompts = config["author_ablation"] if ablation_mode == "AuthorAblation" else config["reviewer_ablation"]
269
 
270
- if interaction_locked:
271
- raise gr.Error("This run is complete. Click Restart to begin a new interaction.")
272
-
273
  prior_history = _sanitize_history(history)
274
 
275
  text = ""
@@ -305,7 +290,18 @@ def run_single_interaction(
305
  user_prompt_template.replace("{{paper_source}}", paper_source)
306
  .replace("{{num_ablations}}", str(num_ablations))
307
  )
308
- user_display = f"Planning {num_ablations} ablations from submitted paper."
 
 
 
 
 
 
 
 
 
 
 
309
 
310
  client = InferenceClient(
311
  token=hf_token.token,
@@ -313,11 +309,23 @@ def run_single_interaction(
313
  provider=provider_name,
314
  )
315
 
316
- messages = [{"role": "system", "content": prompts["system_prompt"]}, *prior_history]
317
- messages.append({"role": "user", "content": user_content})
 
 
 
318
 
319
- live_history = [gr.ChatMessage(role=item["role"], content=item["content"]) for item in prior_history]
 
 
 
 
 
 
 
320
  live_history.append(gr.ChatMessage(role="user", content=user_display))
 
 
321
  live_history.append(
322
  gr.ChatMessage(
323
  role="assistant",
@@ -326,7 +334,6 @@ def run_single_interaction(
326
  )
327
  )
328
 
329
- done_status = "Ablation plan complete. Click Restart to run another one."
330
  emitted = False
331
  raw_output = ""
332
  predictions_message_idx = None
@@ -384,11 +391,7 @@ def run_single_interaction(
384
  )
385
 
386
  emitted = True
387
- yield (
388
- live_history,
389
- done_status,
390
- True,
391
- )
392
  except BadRequestError as e:
393
  message = str(e)
394
  if "model_not_supported" in message:
@@ -411,11 +414,7 @@ def run_single_interaction(
411
  content="_No valid predictions JSONL found._",
412
  )
413
  )
414
- yield (
415
- live_history,
416
- done_status,
417
- True,
418
- )
419
 
420
  def print_like_dislike(x: gr.LikeData):
421
  print(x.index, x.value, x.liked)
@@ -434,42 +433,26 @@ def change_ablation_mode(
434
  )
435
 
436
 
437
- def restart_interaction():
438
- return (
439
- [],
440
- "Ready. Submit your paper.",
441
- False,
442
- )
443
 
444
 
445
- with gr.Blocks(
446
- css="""
447
- #ablation-mode label:has(input[value="AuthorAblation"]) {
448
- color: #7a09b8 !important;
449
- font-weight: 700;
450
- }
451
- #ablation-mode label:has(input[value="ReviewerAblation"]) {
452
- color: #63c009 !important;
453
- font-weight: 700;
454
- }
455
- #ablation-mode input[value="AuthorAblation"] + span,
456
- #ablation-mode input[value="AuthorAblation"] ~ span {
457
- color: #7a09b8 !important;
458
- font-weight: 700;
459
- }
460
- #ablation-mode input[value="ReviewerAblation"] + span,
461
- #ablation-mode input[value="ReviewerAblation"] ~ span {
462
- color: #63c009 !important;
463
- font-weight: 700;
464
- }
465
- """
466
- ) as demo:
467
- demo.load(get_all_marker_models)
468
  gr.Markdown(
469
  """
470
- # Ablation Bench
471
- This app is an ablation-bench interface for comparing behavior between
472
- `AuthorAblation` and `ReviewerAblation` modes.
 
 
 
 
 
 
 
 
 
 
473
  """
474
  )
475
 
@@ -481,17 +464,15 @@ with gr.Blocks(
481
  elem_id="ablation-mode",
482
  )
483
 
484
- status_text = gr.Markdown("Ready. Submit text or a single file.")
485
- restart_btn = gr.Button("↺")
486
  chatbot = gr.Chatbot(
487
  label="Ablation Plan",
488
- buttons=[restart_btn, "copy"],
 
489
  )
490
- interaction_locked = gr.State(False)
491
 
492
  message_input = gr.MultimodalTextbox(
493
  label="Paper content",
494
- placeholder="Paste your paper content here or upload a single PDF/MD/TEX file or a single zip/gzip file of your paper.",
495
  lines=5,
496
  file_count="single",
497
  file_types=[
@@ -525,12 +506,13 @@ with gr.Blocks(
525
  model_id = gr.Dropdown(
526
  choices=[
527
  "openai/gpt-oss-120b",
528
- "zai-org/GLM-5",
 
529
  "moonshotai/Kimi-K2.5",
530
  "moonshotai/Kimi-K2-Thinking",
531
  "moonshotai/Kimi-K2-Instruct",
532
  "deepseek-ai/DeepSeek-V3.2",
533
- "MiniMaxAI/MiniMax-M2.5",
534
  "Qwen/Qwen3-235B-A22B-Instruct-2507",
535
  ],
536
  value="openai/gpt-oss-120b",
@@ -581,6 +563,7 @@ with gr.Blocks(
581
  )
582
 
583
  with gr.Sidebar():
 
584
  gr.LoginButton()
585
 
586
  message_input.submit(
@@ -594,30 +577,16 @@ with gr.Blocks(
594
  top_p,
595
  model_id,
596
  provider_name,
597
- interaction_locked,
598
  ],
599
  outputs=[
600
  chatbot,
601
- status_text,
602
- interaction_locked,
603
- ],
604
- )
605
-
606
- restart_btn.click(
607
- restart_interaction,
608
- outputs=[
609
- chatbot,
610
- status_text,
611
- interaction_locked,
612
  ],
613
  )
614
 
615
  chatbot.clear(
616
- restart_interaction,
617
  outputs=[
618
  chatbot,
619
- status_text,
620
- interaction_locked,
621
  ]
622
  )
623
 
@@ -633,4 +602,4 @@ with gr.Blocks(
633
  chatbot.like(print_like_dislike)
634
 
635
  if __name__ == "__main__":
636
- demo.launch()
 
34
  if isinstance(msg, gr.ChatMessage):
35
  role = msg.role
36
  content = msg.content
37
+ metadata = msg.metadata
38
  elif isinstance(msg, dict):
39
  role = msg.get("role")
40
  content = msg.get("content", "")
41
+ metadata = msg.get("metadata")
42
  else:
43
  continue
44
  if role not in {"user", "assistant", "system"}:
45
  continue
46
+ message = {"role": role, "content": _normalize_message_content(content)}
47
+ if metadata:
48
+ message["metadata"] = metadata
49
+ clean.append(message)
50
  return clean
51
 
52
 
 
168
 
169
  def _convert_pdf_to_markdown(pdf_path: Path) -> str:
170
  try:
171
+ from markitdown import MarkItDown
 
 
172
  except Exception as e:
173
  raise gr.Error(
174
+ "MarkItDown SDK is not available. Make sure `markitdown[pdf]` is installed."
175
  ) from e
176
 
177
  try:
178
+ converter = MarkItDown(enable_plugins=False)
179
+ result = converter.convert(str(pdf_path))
180
+ text = result.text_content
181
  except Exception as e:
182
+ raise gr.Error(f"PDF conversion failed with MarkItDown SDK: {e}") from e
183
 
184
  text = (text or "").strip()
185
  if not text:
186
+ raise gr.Error("MarkItDown SDK produced empty output for this PDF.")
 
 
 
187
  return text
188
 
189
 
 
237
  "archives (.zip/.tar/.tar.gz/.tgz/.gz/.gzip), or .pdf."
238
  )
239
 
 
 
 
 
 
 
 
 
 
 
 
240
  def run_single_interaction(
241
  message_input,
242
  history,
 
246
  top_p,
247
  model_id,
248
  provider_name,
 
249
  hf_token: gr.OAuthToken,
250
  ):
251
  """
 
255
  config = yaml.safe_load(Path("./prompts.yaml").read_text())
256
  prompts = config["author_ablation"] if ablation_mode == "AuthorAblation" else config["reviewer_ablation"]
257
 
 
 
 
258
  prior_history = _sanitize_history(history)
259
 
260
  text = ""
 
290
  user_prompt_template.replace("{{paper_source}}", paper_source)
291
  .replace("{{num_ablations}}", str(num_ablations))
292
  )
293
+ if has_file:
294
+ source_hint = f"file: {file_label}"
295
+ else:
296
+ first_line = (text.splitlines()[0] if text else "").strip()
297
+ first_line_words = first_line.split()[:100]
298
+ preview = " ".join(first_line_words)
299
+ source_hint = f"text preview: {preview}" if preview else "text preview: (empty)"
300
+
301
+ if ablation_mode == "AuthorAblation":
302
+ user_display = f"Planning {num_ablations} ablations for submitted paper ({source_hint})."
303
+ else:
304
+ user_display = f"Reviewing and suggesting {num_ablations} missing ablations for submitted paper ({source_hint})."
305
 
306
  client = InferenceClient(
307
  token=hf_token.token,
 
309
  provider=provider_name,
310
  )
311
 
312
+ # Keep full chat visible to users, but send only current input to model.
313
+ messages = [
314
+ {"role": "system", "content": prompts["system_prompt"]},
315
+ {"role": "user", "content": user_content},
316
+ ]
317
 
318
+ live_history = [
319
+ gr.ChatMessage(
320
+ role=item["role"],
321
+ content=item["content"],
322
+ metadata=item.get("metadata") or {},
323
+ )
324
+ for item in prior_history
325
+ ]
326
  live_history.append(gr.ChatMessage(role="user", content=user_display))
327
+ if has_file and ablation_mode == "AuthorAblation" and "ablat" in paper_source.lower():
328
+ gr.Warning("Uploaded paper appears to already contain ablation content (`ablat*`).")
329
  live_history.append(
330
  gr.ChatMessage(
331
  role="assistant",
 
334
  )
335
  )
336
 
 
337
  emitted = False
338
  raw_output = ""
339
  predictions_message_idx = None
 
391
  )
392
 
393
  emitted = True
394
+ yield live_history
 
 
 
 
395
  except BadRequestError as e:
396
  message = str(e)
397
  if "model_not_supported" in message:
 
414
  content="_No valid predictions JSONL found._",
415
  )
416
  )
417
+ yield live_history
 
 
 
 
418
 
419
  def print_like_dislike(x: gr.LikeData):
420
  print(x.index, x.value, x.liked)
 
433
  )
434
 
435
 
436
+ def clear_chat():
437
+ return []
 
 
 
 
438
 
439
 
440
+ with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
  gr.Markdown(
442
  """
443
+ # <span class="ablationbench">AblationBench:</span> Evaluating Automated Planning of Ablations in Empirical AI Research
444
+
445
+ Can models help automate the design of ablation experiments in scientific papers? To explore this, we introduce <span class="ablationbench">AblationBench</span>, a benchmark for evaluating models on ablation planning in empirical AI research. It includes two tasks: <span class="authorablation">AuthorAblation</span>, where the model helps authors propose ablations from a written method section, and <span class="reviewerablation">ReviewerAblation</span>, where it helps reviewers find and suggest missing ablations in a full paper.
446
+
447
+ This demo shows you how models can plan ablations for your papers using our baseline LM-Planner.
448
+
449
+ You can choose between the two tasks and upload your paper as text or as a file, to plan ablations or find missing ablations in your paper.
450
+
451
+ For best results, follow these guidelines:
452
+ 1. In <span class="authorablation">AuthorAblation</span>, the uploaded paper should include the method section, and should not contain any ablation experiments.
453
+ 2. For both tasks, it is better to use text files than PDFs, or to upload the zip file of your project downloaded from Overleaf.
454
+
455
+ Want to read more? You are welcome to visit our [🌍 project page](https://ablation-bench.github.io/#/), evaluate on our [🤗 benchmark](https://huggingface.co/collections/ai-coscientist/ablationbench) and read our [📎 paper](https://www.arxiv.org/abs/2507.08038).
456
  """
457
  )
458
 
 
464
  elem_id="ablation-mode",
465
  )
466
 
 
 
467
  chatbot = gr.Chatbot(
468
  label="Ablation Plan",
469
+ buttons=["copy"],
470
+ avatar_images=("https://ablation-bench.github.io/_media/user_avatar.png", "https://ablation-bench.github.io/_media/lm_avatar.png"),
471
  )
 
472
 
473
  message_input = gr.MultimodalTextbox(
474
  label="Paper content",
475
+ placeholder="Enter your paper text here, or upload one file: TEX, MD, PDF, ZIP, or GZIP.",
476
  lines=5,
477
  file_count="single",
478
  file_types=[
 
506
  model_id = gr.Dropdown(
507
  choices=[
508
  "openai/gpt-oss-120b",
509
+ "MiniMaxAI/MiniMax-M2.5",
510
+ "Qwen/Qwen3.5-397B-A17B",
511
  "moonshotai/Kimi-K2.5",
512
  "moonshotai/Kimi-K2-Thinking",
513
  "moonshotai/Kimi-K2-Instruct",
514
  "deepseek-ai/DeepSeek-V3.2",
515
+ "zai-org/GLM-5",
516
  "Qwen/Qwen3-235B-A22B-Instruct-2507",
517
  ],
518
  value="openai/gpt-oss-120b",
 
563
  )
564
 
565
  with gr.Sidebar():
566
+ gr.Markdown("""<center><img src="https://ablation-bench.github.io/_media/icon.png"></center>""")
567
  gr.LoginButton()
568
 
569
  message_input.submit(
 
577
  top_p,
578
  model_id,
579
  provider_name,
 
580
  ],
581
  outputs=[
582
  chatbot,
 
 
 
 
 
 
 
 
 
 
 
583
  ],
584
  )
585
 
586
  chatbot.clear(
587
+ clear_chat,
588
  outputs=[
589
  chatbot,
 
 
590
  ]
591
  )
592
 
 
602
  chatbot.like(print_like_dislike)
603
 
604
  if __name__ == "__main__":
605
+ demo.launch(css_paths=Path("style.css"))
requirements.txt CHANGED
@@ -1,2 +1,2 @@
1
  pytz
2
- marker-pdf
 
1
  pytz
2
+ markitdown[pdf]
style.css ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ablation-mode label:has(input[value="AuthorAblation"]) {
2
+ color: #7a09b8 !important;
3
+ font-weight: 700;
4
+ }
5
+ #ablation-mode label:has(input[value="ReviewerAblation"]) {
6
+ color: #63c009 !important;
7
+ font-weight: 700;
8
+ }
9
+ #ablation-mode input[value="AuthorAblation"] + span,
10
+ #ablation-mode input[value="AuthorAblation"] ~ span {
11
+ color: #7a09b8 !important;
12
+ font-weight: 700;
13
+ }
14
+ #ablation-mode input[value="ReviewerAblation"] + span,
15
+ #ablation-mode input[value="ReviewerAblation"] ~ span {
16
+ color: #63c009 !important;
17
+ font-weight: 700;
18
+ }
19
+
20
+ .ablationbench {
21
+ background: linear-gradient(to right, #0C69DA,rgb(129, 176, 233));
22
+ -webkit-text-fill-color: transparent;
23
+ -webkit-background-clip: text;
24
+ font-weight: bold;
25
+ font-style: italic;
26
+ }
27
+
28
+ .authorablation {
29
+ background: linear-gradient(to right, rgb(196, 124, 235),rgb(196, 124, 235));
30
+ -webkit-text-fill-color: transparent;
31
+ -webkit-background-clip: text;
32
+ font-style: italic;
33
+ }
34
+
35
+ .reviewerablation {
36
+ background: linear-gradient(to right, #60BF00, #60BF00);
37
+ -webkit-text-fill-color: transparent;
38
+ -webkit-background-clip: text;
39
+ font-style: italic;
40
+ }