kshitijthakkar committed
Commit e8281aa · 1 Parent(s): 68c1485

feat: Add prompt template support to push_dataset_to_hub


- Added optional prompt_template parameter to push_dataset_to_hub MCP tool
- Automatically includes prompt template in dataset card README when provided
- Creates comprehensive dataset card with usage instructions
- Supports a single agent type as well as agent_type='both' (generates both templates)
- Updated app.py wrapper function to accept prompt_template parameter
- Enables complete end-to-end workflow: generate dataset + template + push with template included
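
A minimal sketch of the resulting end-to-end call from the MCP tool side. Only the new `prompt_template` parameter comes from this commit; the dataset JSON shape, repo name, and template string below are illustrative stand-ins (in practice they would come from `generate_synthetic_dataset` and `generate_prompt_template`):

```python
import asyncio
import json

from mcp_tools import push_dataset_to_hub

# Illustrative inputs: dataset_json would normally come from generate_synthetic_dataset
# and template_yaml from generate_prompt_template; the exact JSON schema is assumed here.
dataset_json = json.dumps([
    {"id": "task_001", "prompt": "Find the latest EUR/USD rate", "expected_tool": "web_search",
     "difficulty": "easy", "agent_type": "tool"}
])
template_yaml = "system_prompt: |\n  You are a helpful tool-calling agent..."

async def main():
    result = await push_dataset_to_hub(
        dataset_json=dataset_json,
        repo_name="your-username/smoltrace-example-tasks",  # hypothetical repo
        hf_token="hf_xxx",                                   # write-scoped token
        private=False,
        prompt_template=template_yaml,  # new in this commit; omit to skip the README template
    )
    print(json.loads(result)["next_steps"])

asyncio.run(main())
```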

Files changed (2)
  1. app.py +4 -3
  2. mcp_tools.py +102 -4
app.py CHANGED
@@ -772,15 +772,16 @@ def create_gradio_ui():
         ```
         """)
 
-    async def run_push_dataset(dataset_json, repo_name, hf_token, private):
-        """Push dataset to hub with async support."""
+    async def run_push_dataset(dataset_json, repo_name, hf_token, private, prompt_template=""):
+        """Push dataset to hub with async support and optional prompt template."""
         try:
             import json
             result = await push_dataset_to_hub(
                 dataset_json=dataset_json,
                 repo_name=repo_name,
                 hf_token=hf_token,
-                private=private
+                private=private,
+                prompt_template=prompt_template if prompt_template else None
             )
             return json.loads(result)
         except Exception as e:
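
On the UI side, the extra prompt_template value just becomes one more Gradio input bound to the async wrapper. A rough wiring sketch follows; the component names are hypothetical and a stub handler stands in for the real `run_push_dataset` defined inside `create_gradio_ui()`:

```python
import gradio as gr

# Stub with the same signature as the real handler, only so this sketch is self-contained.
async def run_push_dataset(dataset_json, repo_name, hf_token, private, prompt_template=""):
    return {"status": "stub"}

with gr.Blocks() as demo:
    # Hypothetical component names; the real UI defines its own in create_gradio_ui().
    dataset_box = gr.Textbox(label="Dataset JSON")
    repo_box = gr.Textbox(label="Repository name")
    token_box = gr.Textbox(label="HF token", type="password")
    private_box = gr.Checkbox(label="Private dataset")
    template_box = gr.Textbox(label="Prompt template (optional YAML)", lines=10)
    push_btn = gr.Button("Push to Hub")
    output_json = gr.JSON(label="Result")

    # Gradio accepts async callables, so the async wrapper binds directly to the click event.
    push_btn.click(
        fn=run_push_dataset,
        inputs=[dataset_box, repo_box, token_box, private_box, template_box],
        outputs=output_json,
    )
```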
mcp_tools.py CHANGED
@@ -1769,13 +1769,15 @@ async def push_dataset_to_hub(
     dataset_json: str,
     repo_name: str,
     hf_token: str,
-    private: bool = False
+    private: bool = False,
+    prompt_template: str = None
 ) -> str:
     """
-    Push a generated synthetic dataset to HuggingFace Hub.
+    Push a generated synthetic dataset to HuggingFace Hub with optional prompt template.
 
     This tool uploads datasets created by generate_synthetic_dataset (or any SMOLTRACE-format
     dataset) to HuggingFace Hub, making them ready for use in SMOLTRACE evaluations.
+    Optionally includes a customized prompt template in the dataset card.
 
     **Naming Convention**: Repo name should follow SMOLTRACE convention:
     - Format: {username}/smoltrace-{domain}-tasks or {username}/smoltrace-{domain}-tasks-v{version}
@@ -1788,6 +1790,7 @@ async def push_dataset_to_hub(
         repo_name (str): HuggingFace repository name following SMOLTRACE naming: {username}/smoltrace-{domain}-tasks
         hf_token (str): HuggingFace API token with write permissions (get from https://huggingface.co/settings/tokens)
         private (bool): Whether to create a private dataset. Default: False (public)
+        prompt_template (str): Optional YAML prompt template to include in dataset card (from generate_prompt_template)
 
     Returns:
         str: JSON response with upload status, dataset URL, and next steps
@@ -1843,19 +1846,114 @@ async def push_dataset_to_hub(
         private=private
     )
 
+    # If prompt template provided, add it to the dataset card
+    if prompt_template and prompt_template.strip():
+        try:
+            print(f"[PUSH_DATASET_TO_HUB] Adding prompt template to dataset card...")
+
+            # Create enhanced README with prompt template
+            readme_content = f"""---
+tags:
+- smoltrace
+- synthetic-data
+- agent-evaluation
+- mcp-generated
+license: mit
+---
+
+# SMOLTRACE Synthetic Dataset
+
+This dataset was generated using the TraceMind MCP Server's synthetic data generation tools.
+
+## Dataset Info
+
+- **Tasks**: {len(tasks)}
+- **Format**: SMOLTRACE evaluation format
+- **Generated**: AI-powered synthetic task generation
+
+## Usage with SMOLTRACE
+
+```python
+from datasets import load_dataset
+
+# Load dataset
+dataset = load_dataset("{repo_name}")
+
+# Use with SMOLTRACE
+# smoltrace-eval --model openai/gpt-4 --dataset-name {repo_name}
+```
+
+## Prompt Template
+
+This dataset includes a customized agent prompt template optimized for the domain and tools used.
+
+### Template File
+
+Save the following as `prompt_template.yaml`:
+
+```yaml
+{prompt_template}
+```
+
+### Using the Template
+
+```python
+from smolagents import ToolCallingAgent  # or CodeAgent
+
+agent = ToolCallingAgent(
+    tools=[...],  # Your tools
+    model="openai/gpt-4",
+    system_prompt_path="prompt_template.yaml"
+)
+```
+
+## Dataset Structure
+
+Each task contains:
+- `id`: Unique task identifier
+- `prompt`: Task description
+- `expected_tool`: Tool the agent should use
+- `difficulty`: Task complexity (easy/medium/hard)
+- `agent_type`: Type of agent (tool/code)
+
+## Generated with TraceMind MCP Server
+
+🔗 [TraceMind MCP Server](https://huggingface.co/spaces/MCP-1st-Birthday/TraceMind-mcp-server)
+
+Part of the MCP's 1st Birthday Hackathon project.
+"""
+
+            # Upload README to dataset repository
+            api = HfApi()
+            api.upload_file(
+                path_or_fileobj=readme_content.encode('utf-8'),
+                path_in_repo="README.md",
+                repo_id=repo_name,
+                repo_type="dataset",
+                token=hf_token
+            )
+
+            print(f"[PUSH_DATASET_TO_HUB] Prompt template added to dataset card successfully")
+
+        except Exception as readme_error:
+            print(f"[WARNING] Failed to add prompt template to README: {readme_error}")
+            # Don't fail the whole operation if README update fails
+
     # Return success response
     result = {
         "status": "success",
-        "message": f"Successfully uploaded {len(tasks)} tasks to HuggingFace Hub",
+        "message": f"Successfully uploaded {len(tasks)} tasks to HuggingFace Hub" + (" with prompt template" if prompt_template else ""),
         "dataset_info": {
             "repository": repo_name,
             "num_tasks": len(tasks),
             "visibility": "private" if private else "public",
-            "dataset_url": f"https://huggingface.co/datasets/{repo_name}"
+            "dataset_url": f"https://huggingface.co/datasets/{repo_name}",
+            "includes_prompt_template": bool(prompt_template)
         },
         "next_steps": {
             "view_dataset": f"https://huggingface.co/datasets/{repo_name}",
             "use_in_smoltrace": f"smoltrace-eval --model openai/gpt-4 --dataset-name {repo_name}",
+            "use_prompt_template": "Check the README.md for the customized prompt template" if prompt_template else "Generate a prompt template using generate_prompt_template tool",
            "share_with_team": f"Team members can access at https://huggingface.co/datasets/{repo_name}" if not private else "Dataset is private - share access via HuggingFace settings"
         }
     }
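
Since the template ends up embedded in the dataset card rather than uploaded as a separate file, a consumer can recover it by downloading README.md from the dataset repo and extracting the YAML fenced block. A rough sketch (the repo name is a placeholder for whatever push_dataset_to_hub returned):

```python
import re

from huggingface_hub import hf_hub_download

repo_name = "your-username/smoltrace-example-tasks"  # placeholder; use your pushed repo

# Fetch the dataset card that push_dataset_to_hub uploaded
readme_path = hf_hub_download(repo_id=repo_name, filename="README.md", repo_type="dataset")
with open(readme_path, encoding="utf-8") as f:
    readme_text = f.read()

# Pull out the first yaml-fenced block, which holds the prompt template
match = re.search(r"```yaml\n(.*?)\n```", readme_text, flags=re.DOTALL)
if match:
    with open("prompt_template.yaml", "w", encoding="utf-8") as out:
        out.write(match.group(1))
    print("Saved prompt_template.yaml")
else:
    print("No prompt template block found in the dataset card")
```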