Talor Abramovich committed on
Commit
05c4cde
·
1 Parent(s): 78a366f

ablation bench space fin

Browse files
Files changed (3) hide show
  1. app.py +67 -98
  2. requirements.txt +1 -1
  3. style.css +40 -0
app.py CHANGED
@@ -34,14 +34,19 @@ def _sanitize_history(history):
34
  if isinstance(msg, gr.ChatMessage):
35
  role = msg.role
36
  content = msg.content
 
37
  elif isinstance(msg, dict):
38
  role = msg.get("role")
39
  content = msg.get("content", "")
 
40
  else:
41
  continue
42
  if role not in {"user", "assistant", "system"}:
43
  continue
44
- clean.append({"role": role, "content": _normalize_message_content(content)})
 
 
 
45
  return clean
46
 
47
 
@@ -163,27 +168,22 @@ def _archive_to_tagged_source(extracted_root: Path) -> str:
163
 
164
  def _convert_pdf_to_markdown(pdf_path: Path) -> str:
165
  try:
166
- from marker.converters.pdf import PdfConverter
167
- from marker.models import create_model_dict
168
- from marker.output import text_from_rendered
169
  except Exception as e:
170
  raise gr.Error(
171
- "Marker SDK is not available. Make sure `marker-pdf` is installed."
172
  ) from e
173
 
174
  try:
175
- converter = PdfConverter(artifact_dict=create_model_dict())
176
- rendered = converter(str(pdf_path))
177
- text, _, _ = text_from_rendered(rendered)
178
  except Exception as e:
179
- raise gr.Error(f"PDF conversion failed with Marker SDK: {e}") from e
180
 
181
  text = (text or "").strip()
182
  if not text:
183
- markdown_text = getattr(rendered, "markdown", "") if rendered is not None else ""
184
- text = (markdown_text or "").strip()
185
- if not text:
186
- raise gr.Error("Marker SDK produced empty output for this PDF.")
187
  return text
188
 
189
 
@@ -237,17 +237,6 @@ def _build_paper_source_from_upload(uploaded_path: str) -> str:
237
  "archives (.zip/.tar/.tar.gz/.tgz/.gz/.gzip), or .pdf."
238
  )
239
 
240
-
241
- def get_all_marker_models():
242
- try:
243
- from marker.models import create_model_dict
244
- create_model_dict()
245
- except Exception as e:
246
- raise gr.Error(
247
- "Marker SDK is not available. Make sure `marker-pdf` is installed."
248
- ) from e
249
-
250
-
251
  def run_single_interaction(
252
  message_input,
253
  history,
@@ -257,7 +246,6 @@ def run_single_interaction(
257
  top_p,
258
  model_id,
259
  provider_name,
260
- interaction_locked,
261
  hf_token: gr.OAuthToken,
262
  ):
263
  """
@@ -267,9 +255,6 @@ def run_single_interaction(
267
  config = yaml.safe_load(Path("./prompts.yaml").read_text())
268
  prompts = config["author_ablation"] if ablation_mode == "AuthorAblation" else config["reviewer_ablation"]
269
 
270
- if interaction_locked:
271
- raise gr.Error("This run is complete. Click Restart to begin a new interaction.")
272
-
273
  prior_history = _sanitize_history(history)
274
 
275
  text = ""
@@ -305,7 +290,18 @@ def run_single_interaction(
305
  user_prompt_template.replace("{{paper_source}}", paper_source)
306
  .replace("{{num_ablations}}", str(num_ablations))
307
  )
308
- user_display = f"Planning {num_ablations} ablations from submitted paper."
 
 
 
 
 
 
 
 
 
 
 
309
 
310
  client = InferenceClient(
311
  token=hf_token.token,
@@ -313,11 +309,23 @@ def run_single_interaction(
313
  provider=provider_name,
314
  )
315
 
316
- messages = [{"role": "system", "content": prompts["system_prompt"]}, *prior_history]
317
- messages.append({"role": "user", "content": user_content})
 
 
 
318
 
319
- live_history = [gr.ChatMessage(role=item["role"], content=item["content"]) for item in prior_history]
 
 
 
 
 
 
 
320
  live_history.append(gr.ChatMessage(role="user", content=user_display))
 
 
321
  live_history.append(
322
  gr.ChatMessage(
323
  role="assistant",
@@ -326,7 +334,6 @@ def run_single_interaction(
326
  )
327
  )
328
 
329
- done_status = "Ablation plan complete. Click Restart to run another one."
330
  emitted = False
331
  raw_output = ""
332
  predictions_message_idx = None
@@ -384,11 +391,7 @@ def run_single_interaction(
384
  )
385
 
386
  emitted = True
387
- yield (
388
- live_history,
389
- done_status,
390
- True,
391
- )
392
  except BadRequestError as e:
393
  message = str(e)
394
  if "model_not_supported" in message:
@@ -411,11 +414,7 @@ def run_single_interaction(
411
  content="_No valid predictions JSONL found._",
412
  )
413
  )
414
- yield (
415
- live_history,
416
- done_status,
417
- True,
418
- )
419
 
420
  def print_like_dislike(x: gr.LikeData):
421
  print(x.index, x.value, x.liked)
@@ -434,42 +433,26 @@ def change_ablation_mode(
434
  )
435
 
436
 
437
- def restart_interaction():
438
- return (
439
- [],
440
- "Ready. Submit your paper.",
441
- False,
442
- )
443
 
444
 
445
- with gr.Blocks(
446
- css="""
447
- #ablation-mode label:has(input[value="AuthorAblation"]) {
448
- color: #7a09b8 !important;
449
- font-weight: 700;
450
- }
451
- #ablation-mode label:has(input[value="ReviewerAblation"]) {
452
- color: #63c009 !important;
453
- font-weight: 700;
454
- }
455
- #ablation-mode input[value="AuthorAblation"] + span,
456
- #ablation-mode input[value="AuthorAblation"] ~ span {
457
- color: #7a09b8 !important;
458
- font-weight: 700;
459
- }
460
- #ablation-mode input[value="ReviewerAblation"] + span,
461
- #ablation-mode input[value="ReviewerAblation"] ~ span {
462
- color: #63c009 !important;
463
- font-weight: 700;
464
- }
465
- """
466
- ) as demo:
467
- demo.load(get_all_marker_models)
468
  gr.Markdown(
469
  """
470
- # Ablation Bench
471
- This app is an ablation-bench interface for comparing behavior between
472
- `AuthorAblation` and `ReviewerAblation` modes.
 
 
 
 
 
 
 
 
 
 
473
  """
474
  )
475
 
@@ -481,17 +464,15 @@ with gr.Blocks(
481
  elem_id="ablation-mode",
482
  )
483
 
484
- status_text = gr.Markdown("Ready. Submit text or a single file.")
485
- restart_btn = gr.Button("↺")
486
  chatbot = gr.Chatbot(
487
  label="Ablation Plan",
488
- buttons=[restart_btn, "copy"],
 
489
  )
490
- interaction_locked = gr.State(False)
491
 
492
  message_input = gr.MultimodalTextbox(
493
  label="Paper content",
494
- placeholder="Paste your paper content here or upload a single PDF/MD/TEX file or a single zip/gzip file of your paper.",
495
  lines=5,
496
  file_count="single",
497
  file_types=[
@@ -525,12 +506,13 @@ with gr.Blocks(
525
  model_id = gr.Dropdown(
526
  choices=[
527
  "openai/gpt-oss-120b",
528
- "zai-org/GLM-5",
 
529
  "moonshotai/Kimi-K2.5",
530
  "moonshotai/Kimi-K2-Thinking",
531
  "moonshotai/Kimi-K2-Instruct",
532
  "deepseek-ai/DeepSeek-V3.2",
533
- "MiniMaxAI/MiniMax-M2.5",
534
  "Qwen/Qwen3-235B-A22B-Instruct-2507",
535
  ],
536
  value="openai/gpt-oss-120b",
@@ -581,6 +563,7 @@ with gr.Blocks(
581
  )
582
 
583
  with gr.Sidebar():
 
584
  gr.LoginButton()
585
 
586
  message_input.submit(
@@ -594,30 +577,16 @@ with gr.Blocks(
594
  top_p,
595
  model_id,
596
  provider_name,
597
- interaction_locked,
598
  ],
599
  outputs=[
600
  chatbot,
601
- status_text,
602
- interaction_locked,
603
- ],
604
- )
605
-
606
- restart_btn.click(
607
- restart_interaction,
608
- outputs=[
609
- chatbot,
610
- status_text,
611
- interaction_locked,
612
  ],
613
  )
614
 
615
  chatbot.clear(
616
- restart_interaction,
617
  outputs=[
618
  chatbot,
619
- status_text,
620
- interaction_locked,
621
  ]
622
  )
623
 
@@ -633,4 +602,4 @@ with gr.Blocks(
633
  chatbot.like(print_like_dislike)
634
 
635
  if __name__ == "__main__":
636
- demo.launch()
 
34
  if isinstance(msg, gr.ChatMessage):
35
  role = msg.role
36
  content = msg.content
37
+ metadata = msg.metadata
38
  elif isinstance(msg, dict):
39
  role = msg.get("role")
40
  content = msg.get("content", "")
41
+ metadata = msg.get("metadata")
42
  else:
43
  continue
44
  if role not in {"user", "assistant", "system"}:
45
  continue
46
+ message = {"role": role, "content": _normalize_message_content(content)}
47
+ if metadata:
48
+ message["metadata"] = metadata
49
+ clean.append(message)
50
  return clean
51
 
52
 
 
168
 
169
  def _convert_pdf_to_markdown(pdf_path: Path) -> str:
170
  try:
171
+ from markitdown import MarkItDown
 
 
172
  except Exception as e:
173
  raise gr.Error(
174
+ "MarkItDown SDK is not available. Make sure `markitdown[pdf]` is installed."
175
  ) from e
176
 
177
  try:
178
+ converter = MarkItDown(enable_plugins=False)
179
+ result = converter.convert(str(pdf_path))
180
+ text = result.text_content
181
  except Exception as e:
182
+ raise gr.Error(f"PDF conversion failed with MarkItDown SDK: {e}") from e
183
 
184
  text = (text or "").strip()
185
  if not text:
186
+ raise gr.Error("MarkItDown SDK produced empty output for this PDF.")
 
 
 
187
  return text
188
 
189
 
 
237
  "archives (.zip/.tar/.tar.gz/.tgz/.gz/.gzip), or .pdf."
238
  )
239
 
 
 
 
 
 
 
 
 
 
 
 
240
  def run_single_interaction(
241
  message_input,
242
  history,
 
246
  top_p,
247
  model_id,
248
  provider_name,
 
249
  hf_token: gr.OAuthToken,
250
  ):
251
  """
 
255
  config = yaml.safe_load(Path("./prompts.yaml").read_text())
256
  prompts = config["author_ablation"] if ablation_mode == "AuthorAblation" else config["reviewer_ablation"]
257
 
 
 
 
258
  prior_history = _sanitize_history(history)
259
 
260
  text = ""
 
290
  user_prompt_template.replace("{{paper_source}}", paper_source)
291
  .replace("{{num_ablations}}", str(num_ablations))
292
  )
293
+ if has_file:
294
+ source_hint = f"file: {file_label}"
295
+ else:
296
+ first_line = (text.splitlines()[0] if text else "").strip()
297
+ first_line_words = first_line.split()[:100]
298
+ preview = " ".join(first_line_words)
299
+ source_hint = f"text preview: {preview}" if preview else "text preview: (empty)"
300
+
301
+ if ablation_mode == "AuthorAblation":
302
+ user_display = f"Planning {num_ablations} ablations for submitted paper ({source_hint})."
303
+ else:
304
+ user_display = f"Reviewing and suggesting {num_ablations} missing ablations for submitted paper ({source_hint})."
305
 
306
  client = InferenceClient(
307
  token=hf_token.token,
 
309
  provider=provider_name,
310
  )
311
 
312
+ # Keep full chat visible to users, but send only current input to model.
313
+ messages = [
314
+ {"role": "system", "content": prompts["system_prompt"]},
315
+ {"role": "user", "content": user_content},
316
+ ]
317
 
318
+ live_history = [
319
+ gr.ChatMessage(
320
+ role=item["role"],
321
+ content=item["content"],
322
+ metadata=item.get("metadata") or {},
323
+ )
324
+ for item in prior_history
325
+ ]
326
  live_history.append(gr.ChatMessage(role="user", content=user_display))
327
+ if has_file and ablation_mode == "AuthorAblation" and "ablat" in paper_source.lower():
328
+ gr.Warning("Uploaded paper appears to already contain ablation content (`ablat*`).")
329
  live_history.append(
330
  gr.ChatMessage(
331
  role="assistant",
 
334
  )
335
  )
336
 
 
337
  emitted = False
338
  raw_output = ""
339
  predictions_message_idx = None
 
391
  )
392
 
393
  emitted = True
394
+ yield live_history
 
 
 
 
395
  except BadRequestError as e:
396
  message = str(e)
397
  if "model_not_supported" in message:
 
414
  content="_No valid predictions JSONL found._",
415
  )
416
  )
417
+ yield live_history
 
 
 
 
418
 
419
  def print_like_dislike(x: gr.LikeData):
420
  print(x.index, x.value, x.liked)
 
433
  )
434
 
435
 
436
+ def clear_chat():
437
+ return []
 
 
 
 
438
 
439
 
440
+ with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
  gr.Markdown(
442
  """
443
+ # <span class="ablationbench">AblationBench:</span> Evaluating Automated Planning of Ablations in Empirical AI Research
444
+
445
+ Can models help automate the design of ablation experiments in scientific papers? To explore this, we introduce <span class="ablationbench">AblationBench</span>, a benchmark for evaluating models on ablation planning in empirical AI research. It includes two tasks: <span class="authorablation">AuthorAblation</span>, where the model helps authors propose ablations from a written method section, and <span class="reviewerablation">ReviewerAblation</span>, where it helps reviewers find and suggest missing ablations in a full paper.
446
+
447
+ This demo shows you how models can plan ablations for your papers using our baseline LM-Planner.
448
+
449
+ You can choose between the two tasks and upload your paper as text or as a file, to plan ablations or find missing ablations in your paper.
450
+
451
+ For best results, follow these guidelines:
452
+ 1. In <span class="authorablation">AuthorAblation</span>, the uploaded paper should include the method section, and should not contain any ablation experiments.
453
+ 2. For both tasks, it is better to use text files than PDFs, or to upload the zip file of your project downloaded from Overleaf.
454
+
455
+ Want to read more? You are welcome to visit our [🌍 project page](https://ablation-bench.github.io/#/), evaluate on our [🤗 benchmark](https://huggingface.co/collections/ai-coscientist/ablationbench) and read our [📎 paper](https://www.arxiv.org/abs/2507.08038).
456
  """
457
  )
458
 
 
464
  elem_id="ablation-mode",
465
  )
466
 
 
 
467
  chatbot = gr.Chatbot(
468
  label="Ablation Plan",
469
+ buttons=["copy"],
470
+ avatar_images=("https://ablation-bench.github.io/_media/user_avatar.png", "https://ablation-bench.github.io/_media/lm_avatar.png"),
471
  )
 
472
 
473
  message_input = gr.MultimodalTextbox(
474
  label="Paper content",
475
+ placeholder="Enter your paper text here, or upload one file: TEX, MD, PDF, ZIP, or GZIP.",
476
  lines=5,
477
  file_count="single",
478
  file_types=[
 
506
  model_id = gr.Dropdown(
507
  choices=[
508
  "openai/gpt-oss-120b",
509
+ "MiniMaxAI/MiniMax-M2.5",
510
+ "Qwen/Qwen3.5-397B-A17B",
511
  "moonshotai/Kimi-K2.5",
512
  "moonshotai/Kimi-K2-Thinking",
513
  "moonshotai/Kimi-K2-Instruct",
514
  "deepseek-ai/DeepSeek-V3.2",
515
+ "zai-org/GLM-5",
516
  "Qwen/Qwen3-235B-A22B-Instruct-2507",
517
  ],
518
  value="openai/gpt-oss-120b",
 
563
  )
564
 
565
  with gr.Sidebar():
566
+ gr.Markdown("""<center><img src="https://ablation-bench.github.io/_media/icon.png"></center>""")
567
  gr.LoginButton()
568
 
569
  message_input.submit(
 
577
  top_p,
578
  model_id,
579
  provider_name,
 
580
  ],
581
  outputs=[
582
  chatbot,
 
 
 
 
 
 
 
 
 
 
 
583
  ],
584
  )
585
 
586
  chatbot.clear(
587
+ clear_chat,
588
  outputs=[
589
  chatbot,
 
 
590
  ]
591
  )
592
 
 
602
  chatbot.like(print_like_dislike)
603
 
604
  if __name__ == "__main__":
605
+ demo.launch(css_paths=Path("style.css"))
requirements.txt CHANGED
@@ -1,2 +1,2 @@
1
  pytz
2
- marker-pdf
 
1
  pytz
2
+ markitdown[pdf]
style.css ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ablation-mode label:has(input[value="AuthorAblation"]) {
2
+ color: #7a09b8 !important;
3
+ font-weight: 700;
4
+ }
5
+ #ablation-mode label:has(input[value="ReviewerAblation"]) {
6
+ color: #63c009 !important;
7
+ font-weight: 700;
8
+ }
9
+ #ablation-mode input[value="AuthorAblation"] + span,
10
+ #ablation-mode input[value="AuthorAblation"] ~ span {
11
+ color: #7a09b8 !important;
12
+ font-weight: 700;
13
+ }
14
+ #ablation-mode input[value="ReviewerAblation"] + span,
15
+ #ablation-mode input[value="ReviewerAblation"] ~ span {
16
+ color: #63c009 !important;
17
+ font-weight: 700;
18
+ }
19
+
20
+ .ablationbench {
21
+ background: linear-gradient(to right, #0C69DA,rgb(129, 176, 233));
22
+ -webkit-text-fill-color: transparent;
23
+ -webkit-background-clip: text;
24
+ font-weight: bold;
25
+ font-style: italic;
26
+ }
27
+
28
+ .authorablation {
29
+ background: linear-gradient(to right, rgb(196, 124, 235),rgb(196, 124, 235));
30
+ -webkit-text-fill-color: transparent;
31
+ -webkit-background-clip: text;
32
+ font-style: italic;
33
+ }
34
+
35
+ .reviewerablation {
36
+ background: linear-gradient(to right, #60BF00, #60BF00);
37
+ -webkit-text-fill-color: transparent;
38
+ -webkit-background-clip: text;
39
+ font-style: italic;
40
+ }