Commit e8281aa · Parent(s): 68c1485

feat: Add prompt template support to push_dataset_to_hub
- Added optional prompt_template parameter to push_dataset_to_hub MCP tool
- Automatically includes prompt template in dataset card README when provided
- Creates comprehensive dataset card with usage instructions
- Supports both single agent type and agent_type='both' (generates both templates)
- Updated app.py wrapper function to accept prompt_template parameter
- Enables a complete end-to-end workflow: generate dataset + template + push with the template included (see the usage sketch below)
- app.py +4 -3
- mcp_tools.py +102 -4
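
For reviewers, a minimal usage sketch of the updated tool follows. It assumes `dataset_json` already holds a SMOLTRACE-format dataset (e.g. from `generate_synthetic_dataset`) and `prompt_template` holds a YAML template string (e.g. from `generate_prompt_template`); the repo name, token, and file paths are placeholders, not values from this commit.

```python
import asyncio
import json

from mcp_tools import push_dataset_to_hub  # tool updated in this commit

async def main():
    # Placeholders: in practice these strings come from
    # generate_synthetic_dataset and generate_prompt_template.
    dataset_json = open("dataset.json", encoding="utf-8").read()
    prompt_template = open("prompt_template.yaml", encoding="utf-8").read()

    result = await push_dataset_to_hub(
        dataset_json=dataset_json,
        repo_name="username/smoltrace-demo-tasks",  # placeholder repo
        hf_token="hf_xxx",                          # write-scoped token
        private=False,
        prompt_template=prompt_template,            # new optional argument
    )
    print(json.loads(result)["dataset_info"])

asyncio.run(main())
```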
app.py CHANGED

````diff
@@ -772,15 +772,16 @@ def create_gradio_ui():
     ```
     """)
 
-    async def run_push_dataset(dataset_json, repo_name, hf_token, private):
-        """Push dataset to hub with async support."""
+    async def run_push_dataset(dataset_json, repo_name, hf_token, private, prompt_template=""):
+        """Push dataset to hub with async support and optional prompt template."""
         try:
             import json
             result = await push_dataset_to_hub(
                 dataset_json=dataset_json,
                 repo_name=repo_name,
                 hf_token=hf_token,
-                private=private
+                private=private,
+                prompt_template=prompt_template if prompt_template else None
             )
             return json.loads(result)
         except Exception as e:
````
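
One detail in the wrapper: a blank Gradio textbox arrives as an empty string, which the wrapper normalizes to `None` so the tool skips the README step entirely. A standalone sketch of that normalization (illustrative only, not part of app.py):

```python
# Sketch: mirror of the wrapper's "prompt_template if prompt_template else None"
# normalization, so an empty textbox means "no template" to push_dataset_to_hub.
def normalize_template(prompt_template: str = ""):
    return prompt_template if prompt_template else None

assert normalize_template("") is None                    # blank textbox -> no template
assert normalize_template("system_prompt: ...") is not None
```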
mcp_tools.py CHANGED

````diff
@@ -1769,13 +1769,15 @@ async def push_dataset_to_hub(
     dataset_json: str,
     repo_name: str,
     hf_token: str,
-    private: bool = False
+    private: bool = False,
+    prompt_template: str = None
 ) -> str:
     """
-    Push a generated synthetic dataset to HuggingFace Hub.
+    Push a generated synthetic dataset to HuggingFace Hub with optional prompt template.
 
     This tool uploads datasets created by generate_synthetic_dataset (or any SMOLTRACE-format
     dataset) to HuggingFace Hub, making them ready for use in SMOLTRACE evaluations.
+    Optionally includes a customized prompt template in the dataset card.
 
     **Naming Convention**: Repo name should follow SMOLTRACE convention:
     - Format: {username}/smoltrace-{domain}-tasks or {username}/smoltrace-{domain}-tasks-v{version}
@@ -1788,6 +1790,7 @@ async def push_dataset_to_hub(
         repo_name (str): HuggingFace repository name following SMOLTRACE naming: {username}/smoltrace-{domain}-tasks
         hf_token (str): HuggingFace API token with write permissions (get from https://huggingface.co/settings/tokens)
         private (bool): Whether to create a private dataset. Default: False (public)
+        prompt_template (str): Optional YAML prompt template to include in dataset card (from generate_prompt_template)
 
     Returns:
         str: JSON response with upload status, dataset URL, and next steps
@@ -1843,19 +1846,114 @@ async def push_dataset_to_hub(
         private=private
     )
 
+    # If prompt template provided, add it to the dataset card
+    if prompt_template and prompt_template.strip():
+        try:
+            print(f"[PUSH_DATASET_TO_HUB] Adding prompt template to dataset card...")
+
+            # Create enhanced README with prompt template
+            readme_content = f"""---
+tags:
+- smoltrace
+- synthetic-data
+- agent-evaluation
+- mcp-generated
+license: mit
+---
+
+# SMOLTRACE Synthetic Dataset
+
+This dataset was generated using the TraceMind MCP Server's synthetic data generation tools.
+
+## Dataset Info
+
+- **Tasks**: {len(tasks)}
+- **Format**: SMOLTRACE evaluation format
+- **Generated**: AI-powered synthetic task generation
+
+## Usage with SMOLTRACE
+
+```python
+from datasets import load_dataset
+
+# Load dataset
+dataset = load_dataset("{repo_name}")
+
+# Use with SMOLTRACE
+# smoltrace-eval --model openai/gpt-4 --dataset-name {repo_name}
+```
+
+## Prompt Template
+
+This dataset includes a customized agent prompt template optimized for the domain and tools used.
+
+### Template File
+
+Save the following as `prompt_template.yaml`:
+
+```yaml
+{prompt_template}
+```
+
+### Using the Template
+
+```python
+from smolagents import ToolCallingAgent  # or CodeAgent
+
+agent = ToolCallingAgent(
+    tools=[...],  # Your tools
+    model="openai/gpt-4",
+    system_prompt_path="prompt_template.yaml"
+)
+```
+
+## Dataset Structure
+
+Each task contains:
+- `id`: Unique task identifier
+- `prompt`: Task description
+- `expected_tool`: Tool the agent should use
+- `difficulty`: Task complexity (easy/medium/hard)
+- `agent_type`: Type of agent (tool/code)
+
+## Generated with TraceMind MCP Server
+
+🔗 [TraceMind MCP Server](https://huggingface.co/spaces/MCP-1st-Birthday/TraceMind-mcp-server)
+
+Part of the MCP's 1st Birthday Hackathon project.
+"""
+
+            # Upload README to dataset repository
+            api = HfApi()
+            api.upload_file(
+                path_or_fileobj=readme_content.encode('utf-8'),
+                path_in_repo="README.md",
+                repo_id=repo_name,
+                repo_type="dataset",
+                token=hf_token
+            )
+
+            print(f"[PUSH_DATASET_TO_HUB] Prompt template added to dataset card successfully")
+
+        except Exception as readme_error:
+            print(f"[WARNING] Failed to add prompt template to README: {readme_error}")
+            # Don't fail the whole operation if README update fails
+
     # Return success response
     result = {
         "status": "success",
-        "message": f"Successfully uploaded {len(tasks)} tasks to HuggingFace Hub",
+        "message": f"Successfully uploaded {len(tasks)} tasks to HuggingFace Hub" + (" with prompt template" if prompt_template else ""),
         "dataset_info": {
             "repository": repo_name,
             "num_tasks": len(tasks),
             "visibility": "private" if private else "public",
-            "dataset_url": f"https://huggingface.co/datasets/{repo_name}"
+            "dataset_url": f"https://huggingface.co/datasets/{repo_name}",
+            "includes_prompt_template": bool(prompt_template)
         },
         "next_steps": {
             "view_dataset": f"https://huggingface.co/datasets/{repo_name}",
             "use_in_smoltrace": f"smoltrace-eval --model openai/gpt-4 --dataset-name {repo_name}",
+            "use_prompt_template": "Check the README.md for the customized prompt template" if prompt_template else "Generate a prompt template using generate_prompt_template tool",
             "share_with_team": f"Team members can access at https://huggingface.co/datasets/{repo_name}" if not private else "Dataset is private - share access via HuggingFace settings"
         }
     }
````
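
A quick way to sanity-check the new behavior after a push is to pull the generated dataset card back down and look for the template section. This is a review aid, not part of the commit; the repo id is a placeholder and the checks assume the README layout shown in the diff above.

```python
# Sketch: verify the dataset card written by push_dataset_to_hub contains the
# prompt template section. Placeholder repo id; assumes the README layout above.
from huggingface_hub import hf_hub_download

readme_path = hf_hub_download(
    repo_id="username/smoltrace-demo-tasks",  # placeholder
    filename="README.md",
    repo_type="dataset",
)

with open(readme_path, encoding="utf-8") as f:
    card = f.read()

assert "## Prompt Template" in card
assert "prompt_template.yaml" in card
print("Prompt template section found in dataset card")
```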