Spaces:
Runtime error
Runtime error
| """ | |
| You do not need to run this program yourself. It is hosted on Hugging Face | |
| Spaces at: | |
| https://huggingface.co/spaces/nuprl/BigCodeBench-MultiPL-Stdio-Problem-Inspector | |
| If you want to run it yourself, you can do the following: | |
| We use this program to help inspect our synthesized problems. These are the | |
| steps to run it end-to-end: | |
| 1. Create a jsonl file that joins synthesized problems with their execution | |
| results. | |
| uv run python3 -m bigcodebench_multipl.stdio_problem_inspector upload \ | |
| --problems-path unfiltered_stdio.jsonl \ | |
| --results-path unfiltered_stdio.results.jsonl \ | |
| --output-path unfiltered_stdio.joined.jsonl | |
| 2. Upload the dataset to the Hugging Face Hub for the next steps. | |
| mkdir python_stdio | |
| mv unfiltered_stdio.joined.jsonl python_stdio/test.jsonl | |
| Now, drag and drop the *folder* above to a Hugging Face dataset. | |
| 3. Run the inspector: | |
| uv run python3 -m bigcodebench_multipl.stdio_problem_inspector dataset-inspector | |
| """ | |
import argparse
import ast
from pathlib import Path
from typing import Generator, TypedDict

import datasets
import gradio as gr
import pandas as pd
| ################################################################################ | |
| # Copy-pasted from bcb_reader.py. # | |
| ################################################################################ | |
| # This is the format of BigCodeBench problems. However, BigCodeBench-Hard has | |
| # a few extra columns. | |
class _OriginalBigCodeBenchProblem(TypedDict):
    """Schema of a row in the upstream ``bigcode/bigcodebench`` dataset."""

    task_id: str
    complete_prompt: str
    instruct_prompt: str
    # Completion expected from a base model; appended to the code prefix
    # recovered from `instruct_prompt` to form a complete solution.
    canonical_solution: str
    code_prompt: str
    test: str
    entry_point: str
    doc_struct: str
    libs: str
class BigCodeBenchProblem(TypedDict):
    """Simplified problem format used for translation/inspection."""

    task_id: str
    # Natural-language prompt: the instruct prompt with the code boilerplate
    # stripped off.
    problem: str
    # Complete, parseable solution program (code prefix + canonical solution).
    solution: str
    # Test-suite source code.
    tests: str
# Marker that separates the natural-language problem from the code prefix in
# every BigCodeBench `instruct_prompt`. The code prefix that follows the
# marker is fenced, ending with `_PROMPT_SUFFIX`.
_PROMPT_BOILERPLATE = "\nYou should write self-contained code starting with:\n```\n"
_PROMPT_SUFFIX = "```"
def _prepare_bcb_problem(item: _OriginalBigCodeBenchProblem) -> BigCodeBenchProblem:
    """
    Convert an original BigCodeBench row into the simplified problem format.

    Every BCB problem has a canonical solution, which is a completion expected
    from a base model. This function splits the instruct prompt at the
    boilerplate marker to recover the natural-language problem and the code
    prefix, then joins the prefix with the canonical solution to get a
    complete solution program.

    Raises:
        AssertionError: if the prompt does not end with the closing code fence.
        SyntaxError: if the reconstructed solution or tests fail to parse.
    """
    instruct_prompt = item["instruct_prompt"]
    problem, solution_prefix = instruct_prompt.split(_PROMPT_BOILERPLATE, maxsplit=1)
    assert solution_prefix.endswith(
        _PROMPT_SUFFIX
    ), f"Prompt ends with {solution_prefix[-20:]!r}"
    # Drop the closing code fence; what remains is the code prefix.
    solution_prefix = solution_prefix[: -len(_PROMPT_SUFFIX)]
    solution = solution_prefix + item["canonical_solution"]
    tests = item["test"]
    # As a sanity check, parse. We get syntax warnings on standard error.
    ast.parse(solution, filename=item["task_id"])
    ast.parse(tests, filename="test_" + item["task_id"])
    return BigCodeBenchProblem(
        task_id=item["task_id"],
        problem=problem,
        solution=solution,
        tests=tests,
    )
def load_bigcodebench() -> Generator[BigCodeBenchProblem, None, None]:
    """
    Loads the BigCodeBench dataset in a format appropriate for translation.

    Streams one simplified problem per row of the upstream
    ``bigcode/bigcodebench`` dataset (split ``v0.1.4``).
    """
    bcb = datasets.load_dataset("bigcode/bigcodebench", split="v0.1.4")
    for item in bcb:
        yield _prepare_bcb_problem(item)
| ################################################################################ | |
def upload(problems_path: Path, results_path: Path, output_path: Path):
    """
    Join synthesized problems with their execution results into one JSONL file.

    Reads both inputs as JSON-lines, left-joins the results onto the problems
    by ``task_id``, sanity-checks the resulting column layout, and writes the
    joined rows to ``output_path`` as JSON-lines.
    """
    expected_columns = [
        "reasoning",
        "prompt",
        "program",
        "test_suite",
        "task_id",
        "timeout",
        "exit_code",
        "stdout",
        "stderr",
    ]
    problems_df = pd.read_json(problems_path, lines=True)
    results_df = pd.read_json(results_path, lines=True)
    merged = problems_df.merge(results_df, on="task_id", how="left")
    assert (
        list(merged.columns) == expected_columns
    ), "Unexpected columns after the join. Are you sure you are merging the right files?"
    merged.to_json(output_path, orient="records", lines=True)
def dataset_inspector(dataset_name: str, data_dir: str):
    """
    Launch a Gradio app that browses joined problems side-by-side with the
    original BigCodeBench problems they were derived from.

    Args:
        dataset_name: name of the joined dataset on the Hugging Face Hub.
        data_dir: directory (configuration) within that dataset.
    """
    dataset = datasets.load_dataset(dataset_name, data_dir=data_dir, split="test")
    original_dataset = pd.DataFrame(load_bigcodebench())
    # Rename the original columns so they do not collide with the synthesized
    # columns after the merge below.
    original_dataset = original_dataset.rename(columns={
        "problem": "original_prompt",
        "solution": "original_program",
        "tests": "original_test_suite",
    })

    # Convert to pandas DataFrame for easier manipulation
    df = dataset.to_pandas()
    df = df.merge(original_dataset, on="task_id", how="left")

    def get_filtered_data(predicate):
        """Filter the dataset based on predicate"""
        filtered_df = df.copy()
        # Start from an all-False Series mask. (The previous scalar `False`
        # made `filtered_df[selector]` raise a KeyError when no filter was
        # enabled; a Series mask correctly selects zero rows instead.)
        selector = pd.Series(False, index=filtered_df.index)
        if predicate.get('filter_timeout', False):
            selector = selector | (filtered_df['timeout'] == True)
        if predicate.get('filter_successes', False):
            selector = selector | (filtered_df['exit_code'] == 0)
        if predicate.get('filter_errors', False):
            # We use exit_code < 0 for timeout.
            selector = selector | (filtered_df['exit_code'] > 0)
        return filtered_df[selector]

    def format_problem_display(row, predicate):
        """Format a single problem for display - returns (generated_content, original_content)"""
        generated_content = []
        original_content = []

        # Add reasoning to generated side if checkbox is checked
        if predicate.get('show_reasoning', False):
            generated_content.append("## Reasoning")
            generated_content.append(str(row['reasoning']))
            generated_content.append("")

        # Generated content
        generated_content.append("# Generated")
        generated_content.append("")
        generated_content.append("## Prompt")
        generated_content.append(str(row['prompt']))
        generated_content.append("")
        generated_content.append("## Program")
        generated_content.append("```python")
        generated_content.append(str(row['program']))
        generated_content.append("```")
        generated_content.append("")
        generated_content.append("## Test Suite")
        generated_content.append("```python")
        generated_content.append(str(row['test_suite']))
        generated_content.append("```")
        generated_content.append("")

        # Add execution results to generated side
        if str(row['stdout']).strip():
            generated_content.append("## Standard Output")
            generated_content.append("```")
            generated_content.append(str(row['stdout']))
            generated_content.append("```")
            generated_content.append("")

        if str(row['stderr']).strip():
            generated_content.append("## Standard Error")
            generated_content.append("```")
            generated_content.append(str(row['stderr']))
            generated_content.append("```")
            generated_content.append("")

        generated_content.append("## Metadata")
        generated_content.append(f"**Task ID:** {row['task_id']}")
        generated_content.append(f"**Timeout:** {row['timeout']}")
        generated_content.append(f"**Exit Code:** {row['exit_code']}")

        # Original content
        original_content.append("# Original")
        original_content.append("")
        original_content.append("## Prompt")
        original_content.append(str(row['original_prompt']))
        original_content.append("")
        original_content.append("## Program")
        original_content.append("```python")
        original_content.append(str(row['original_program']))
        original_content.append("```")
        original_content.append("")
        original_content.append("## Test Suite")
        original_content.append("```python")
        original_content.append(str(row['original_test_suite']))
        original_content.append("```")

        return "\n".join(generated_content), "\n".join(original_content)

    def update_display(current_index, predicate):
        """Update the display based on current predicate and index"""
        filtered_df = get_filtered_data(predicate)

        if len(filtered_df) == 0:
            return "No problems match the current filters.", "No problems match the current filters.", "0 / 0", gr.update(interactive=False), gr.update(interactive=False)

        # Ensure index is within bounds
        current_index = max(0, min(current_index, len(filtered_df) - 1))

        row = filtered_df.iloc[current_index]
        generated_content, original_content = format_problem_display(row, predicate)
        status = f"{current_index + 1} / {len(filtered_df)}"

        # Update button states
        prev_enabled = current_index > 0
        next_enabled = current_index < len(filtered_df) - 1

        return generated_content, original_content, status, gr.update(interactive=prev_enabled), gr.update(interactive=next_enabled)

    def go_prev(current_index, predicate):
        """Go to previous problem"""
        new_index = max(0, current_index - 1)
        generated_content, original_content, status, prev_btn, next_btn = update_display(new_index, predicate)
        return generated_content, original_content, status, new_index, prev_btn, next_btn

    def go_next(current_index, predicate):
        """Go to next problem"""
        filtered_df = get_filtered_data(predicate)
        new_index = min(len(filtered_df) - 1, current_index + 1)
        generated_content, original_content, status, prev_btn, next_btn = update_display(new_index, predicate)
        return generated_content, original_content, status, new_index, prev_btn, next_btn

    def on_filter_change(current_index, predicate):
        """Handle filter changes - reset to first item"""
        generated_content, original_content, status, prev_btn, next_btn = update_display(0, predicate)
        return generated_content, original_content, status, 0, prev_btn, next_btn

    def update_predicate(predicate, key, value):
        """Update a single key in the predicate"""
        new_predicate = predicate.copy()
        new_predicate[key] = value
        return new_predicate

    # Create Gradio interface
    with gr.Blocks(title="BigCodeBench Problem Inspector") as demo:
        gr.Markdown("# BigCodeBench-MultiPL Problem Inspector")

        # State to track current index and predicate
        current_index = gr.State(0)
        predicate = gr.State({
            'filter_timeout': False,
            'filter_successes': True,
            'filter_errors': False,
            'show_reasoning': False
        })

        # Top controls row
        with gr.Row():
            prev_btn = gr.Button("← Previous", size="sm")
            status_text = gr.Textbox(value="1 / 1", interactive=False, container=False, show_label=False)
            next_btn = gr.Button("Next →", size="sm")

        # Filter controls
        with gr.Row():
            filter_timeout = gr.Checkbox(label="Filter by timeout = True", value=False)
            filter_successes = gr.Checkbox(label="Show successes (exit_code == 0)", value=True)
            filter_errors = gr.Checkbox(label="Show errors (exit_code != 0)", value=False)
            show_reasoning = gr.Checkbox(label="Show reasoning", value=False)

        # Main content area - two columns
        with gr.Row():
            with gr.Column():
                generated_display = gr.Markdown(value="Loading generated content...", height=600)
            with gr.Column():
                original_display = gr.Markdown(value="Loading original content...", height=600)

        # Initialize display with the same defaults as the `predicate` state.
        demo.load(
            fn=lambda: update_display(0, {'filter_timeout': False, 'filter_successes': True, 'filter_errors': False, 'show_reasoning': False}),
            outputs=[generated_display, original_display, status_text, prev_btn, next_btn]
        )

        # Event handlers
        prev_btn.click(
            fn=go_prev,
            inputs=[current_index, predicate],
            outputs=[generated_display, original_display, status_text, current_index, prev_btn, next_btn]
        )

        next_btn.click(
            fn=go_next,
            inputs=[current_index, predicate],
            outputs=[generated_display, original_display, status_text, current_index, prev_btn, next_btn]
        )

        # Filter change handlers. Each lambda recomputes the display with the
        # updated predicate and also emits the new predicate so the gr.State
        # stays in sync with the checkbox.
        filter_timeout.change(
            fn=lambda current_idx, pred, value: (
                *on_filter_change(current_idx, update_predicate(pred, 'filter_timeout', value)),
                update_predicate(pred, 'filter_timeout', value)
            ),
            inputs=[current_index, predicate, filter_timeout],
            outputs=[generated_display, original_display, status_text, current_index, prev_btn, next_btn, predicate]
        )

        filter_errors.change(
            fn=lambda current_idx, pred, value: (
                *on_filter_change(current_idx, update_predicate(pred, 'filter_errors', value)),
                update_predicate(pred, 'filter_errors', value)
            ),
            inputs=[current_index, predicate, filter_errors],
            outputs=[generated_display, original_display, status_text, current_index, prev_btn, next_btn, predicate]
        )

        filter_successes.change(
            fn=lambda current_idx, pred, value: (
                *on_filter_change(current_idx, update_predicate(pred, 'filter_successes', value)),
                update_predicate(pred, 'filter_successes', value)
            ),
            inputs=[current_index, predicate, filter_successes],
            outputs=[generated_display, original_display, status_text, current_index, prev_btn, next_btn, predicate]
        )

        # Toggling reasoning re-renders at the current index (no reset to 0),
        # so `current_index` is not among the outputs here.
        show_reasoning.change(
            fn=lambda current_idx, pred, value: (
                *update_display(current_idx, update_predicate(pred, 'show_reasoning', value)),
                update_predicate(pred, 'show_reasoning', value)
            ),
            inputs=[current_index, predicate, show_reasoning],
            outputs=[generated_display, original_display, status_text, prev_btn, next_btn, predicate]
        )

    demo.launch(share=True)
def main():
    """Command-line entry point: dispatch to ``upload`` or ``dataset_inspector``."""
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="subcommand")

    upload_parser = subparsers.add_parser("upload", help="Prepare the dataset")
    upload_parser.add_argument(
        "--problems-path",
        type=Path,
        required=True,
        help="Output from make_stdio_problem.py",
    )
    upload_parser.add_argument(
        "--results-path",
        type=Path,
        required=True,
        help="Execution results from --problems-path",
    )
    upload_parser.add_argument(
        "--output-path",
        type=Path,
        required=True,
        help="Output path to save the joined dataset",
    )

    inspector_parser = subparsers.add_parser("dataset-inspector", help="Inspect a dataset")
    inspector_parser.add_argument(
        "--dataset-name",
        type=str,
        default="nuprl/BigCodeBench-MultiPL-Results",
        help="Name of the dataset on the Hugging Face Hub",
    )
    inspector_parser.add_argument(
        "--data-dir",
        type=str,
        default="python_stdio",
        help="Name of the directory on the Hugging Face Hub",
    )

    args = parser.parse_args()
    # Forward everything except the subcommand selector as keyword arguments.
    command_kwargs = dict(vars(args))
    del command_kwargs["subcommand"]

    if args.subcommand == "upload":
        upload(**command_kwargs)
    elif args.subcommand == "dataset-inspector":
        dataset_inspector(**command_kwargs)
    elif args.subcommand is None:
        # No subcommand (e.g. when hosted on Spaces): run the inspector with
        # its default dataset.
        dataset_inspector(dataset_name="nuprl/BigCodeBench-MultiPL-Results", data_dir="python_stdio")
    else:
        raise ValueError(f"Unknown subcommand: {args.subcommand}")


if __name__ == "__main__":
    main()