import gradio as gr
# Chat template definitions: per-role start/end markers plus optional BOS/EOS
# tokens; "first_user_start" (Llama 2 / Mistral only) replaces user_start when
# no system prompt is present.
TEMPLATES = {
"ChatML (Qwen, OpenHermes, etc.)": {
"system_start": "<|im_start|>system\n",
"system_end": "<|im_end|>\n",
"user_start": "<|im_start|>user\n",
"user_end": "<|im_end|>\n",
"assistant_start": "<|im_start|>assistant\n",
"assistant_end": "<|im_end|>\n",
"bos": "",
"eos": "",
},
"Llama 2 / Mistral": {
"system_start": "[INST] <<SYS>>\n",
"system_end": "\n<</SYS>>\n\n",
"user_start": "",
"user_end": " [/INST] ",
"assistant_start": "",
"assistant_end": "</s><s>[INST] ",
"bos": "<s>",
"eos": "</s>",
"first_user_start": "[INST] ", # When no system prompt
},
"Llama 3 / Llama 3.1": {
"system_start": "<|start_header_id|>system<|end_header_id|>\n\n",
"system_end": "<|eot_id|>",
"user_start": "<|start_header_id|>user<|end_header_id|>\n\n",
"user_end": "<|eot_id|>",
"assistant_start": "<|start_header_id|>assistant<|end_header_id|>\n\n",
"assistant_end": "<|eot_id|>",
"bos": "<|begin_of_text|>",
"eos": "",
},
"Alpaca": {
"system_start": "",
"system_end": "",
"user_start": "### Instruction:\n",
"user_end": "\n\n",
"input_start": "### Input:\n",
"input_end": "\n\n",
"assistant_start": "### Response:\n",
"assistant_end": "\n\n",
"bos": "",
"eos": "",
},
"Vicuna": {
"system_start": "",
"system_end": "\n\n",
"user_start": "USER: ",
"user_end": "\n",
"assistant_start": "ASSISTANT: ",
"assistant_end": "</s>\n",
"bos": "",
"eos": "",
},
"Gemma": {
"system_start": "",
"system_end": "",
"user_start": "<start_of_turn>user\n",
"user_end": "<end_of_turn>\n",
"assistant_start": "<start_of_turn>model\n",
"assistant_end": "<end_of_turn>\n",
"bos": "<bos>",
"eos": "",
},
"Phi-3": {
"system_start": "<|system|>\n",
"system_end": "<|end|>\n",
"user_start": "<|user|>\n",
"user_end": "<|end|>\n",
"assistant_start": "<|assistant|>\n",
"assistant_end": "<|end|>\n",
"bos": "",
"eos": "",
},
"Zephyr": {
"system_start": "<|system|>\n",
"system_end": "</s>\n",
"user_start": "<|user|>\n",
"user_end": "</s>\n",
"assistant_start": "<|assistant|>\n",
"assistant_end": "</s>\n",
"bos": "",
"eos": "",
},
"Command-R": {
"system_start": "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
"system_end": "<|END_OF_TURN_TOKEN|>",
"user_start": "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>",
"user_end": "<|END_OF_TURN_TOKEN|>",
"assistant_start": "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
"assistant_end": "<|END_OF_TURN_TOKEN|>",
"bos": "<BOS_TOKEN>",
"eos": "",
},
"DeepSeek": {
"system_start": "",
"system_end": "\n\n",
"user_start": "User: ",
"user_end": "\n\n",
"assistant_start": "Assistant: ",
"assistant_end": "<|end▁of▁sentence|>",
"bos": "<|begin▁of▁sentence|>",
"eos": "",
},
"Raw (No Template)": {
"system_start": "System: ",
"system_end": "\n\n",
"user_start": "User: ",
"user_end": "\n\n",
"assistant_start": "Assistant: ",
"assistant_end": "\n\n",
"bos": "",
"eos": "",
},
}
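
# Quick illustration of the schema (a minimal sanity check, not part of the
# UI): each template maps role markers to plain strings, so formatting is just
# concatenation (see format_prompt below).
assert TEMPLATES["Gemma"]["user_start"] == "<start_of_turn>user\n"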
# Model to template mapping
MODEL_TEMPLATES = {
"Qwen/Qwen2.5-7B-Instruct": "ChatML (Qwen, OpenHermes, etc.)",
"Qwen/Qwen2.5-Coder-7B-Instruct": "ChatML (Qwen, OpenHermes, etc.)",
"teknium/OpenHermes-2.5-Mistral-7B": "ChatML (Qwen, OpenHermes, etc.)",
"NousResearch/Hermes-2-Pro-Mistral-7B": "ChatML (Qwen, OpenHermes, etc.)",
"mistralai/Mistral-7B-Instruct-v0.2": "Llama 2 / Mistral",
"mistralai/Mistral-7B-Instruct-v0.3": "Llama 2 / Mistral",
"meta-llama/Llama-2-7b-chat-hf": "Llama 2 / Mistral",
"meta-llama/Meta-Llama-3-8B-Instruct": "Llama 3 / Llama 3.1",
"meta-llama/Meta-Llama-3.1-8B-Instruct": "Llama 3 / Llama 3.1",
"google/gemma-1.1-7b-it": "Gemma",
"google/gemma-2-9b-it": "Gemma",
"microsoft/Phi-3-mini-4k-instruct": "Phi-3",
"HuggingFaceH4/zephyr-7b-beta": "Zephyr",
"CohereForAI/c4ai-command-r-v01": "Command-R",
"deepseek-ai/deepseek-coder-7b-instruct-v1.5": "DeepSeek",
}
def format_prompt(template_name, system_msg, user_msg, assistant_msg, include_generation_prompt):
"""Format messages using the selected template"""
if template_name not in TEMPLATES:
return "Template not found"
t = TEMPLATES[template_name]
prompt = ""
# Add BOS token
if t.get("bos"):
prompt += t["bos"]
# Add system message
if system_msg.strip():
prompt += t["system_start"] + system_msg.strip() + t["system_end"]
# Add user message
if user_msg.strip():
# Special case for Llama 2 without system prompt
if template_name == "Llama 2 / Mistral" and not system_msg.strip():
prompt += t.get("first_user_start", t["user_start"])
else:
prompt += t["user_start"]
prompt += user_msg.strip() + t["user_end"]
# Add assistant message (if provided, for multi-turn)
if assistant_msg.strip():
prompt += t["assistant_start"] + assistant_msg.strip() + t["assistant_end"]
# Add generation prompt (assistant start token)
if include_generation_prompt:
prompt += t["assistant_start"]
return prompt
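
# Minimal usage sketch of format_prompt. The expected string below is derived
# from the ChatML entries in TEMPLATES above; update it if those markers change.
_chatml_demo = format_prompt(
    "ChatML (Qwen, OpenHermes, etc.)",
    "You are a helpful assistant.",
    "Hello!",
    "",
    True,
)
assert _chatml_demo == (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\nHello!<|im_end|>\n"
    "<|im_start|>assistant\n"
)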
def format_multi_turn(template_name, conversation_text, include_generation_prompt):
"""Format multi-turn conversation"""
if template_name not in TEMPLATES:
return "Template not found"
t = TEMPLATES[template_name]
prompt = ""
# Add BOS token
if t.get("bos"):
prompt += t["bos"]
# Parse conversation
lines = conversation_text.strip().split("\n")
system_msg = ""
messages = []
for line in lines:
line = line.strip()
if not line:
continue
if line.lower().startswith("system:"):
system_msg = line[7:].strip()
elif line.lower().startswith("user:"):
messages.append(("user", line[5:].strip()))
elif line.lower().startswith("assistant:"):
messages.append(("assistant", line[10:].strip()))
# Add system message
if system_msg:
prompt += t["system_start"] + system_msg + t["system_end"]
# Add conversation turns
for i, (role, content) in enumerate(messages):
if role == "user":
# Special case for Llama 2 first user without system
if template_name == "Llama 2 / Mistral" and i == 0 and not system_msg:
prompt += t.get("first_user_start", t["user_start"])
else:
prompt += t["user_start"]
prompt += content + t["user_end"]
elif role == "assistant":
prompt += t["assistant_start"] + content + t["assistant_end"]
# Add generation prompt
if include_generation_prompt:
prompt += t["assistant_start"]
return prompt
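
# Sketch of format_multi_turn on a short Vicuna exchange. The expected output
# is an assumption derived from the Vicuna entries in TEMPLATES above.
_vicuna_demo = format_multi_turn(
    "Vicuna",
    "System: Be concise.\nUser: Hi!\nAssistant: Hello.\nUser: Bye.",
    True,
)
assert _vicuna_demo == (
    "Be concise.\n\nUSER: Hi!\nASSISTANT: Hello.</s>\nUSER: Bye.\nASSISTANT: "
)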
def get_template_from_model(model_name):
"""Get template name from model"""
if model_name in MODEL_TEMPLATES:
return MODEL_TEMPLATES[model_name]
# Try to guess from model name
model_lower = model_name.lower()
if "qwen" in model_lower or "hermes" in model_lower:
return "ChatML (Qwen, OpenHermes, etc.)"
elif "llama-3" in model_lower or "llama3" in model_lower:
return "Llama 3 / Llama 3.1"
elif "llama-2" in model_lower or "mistral" in model_lower:
return "Llama 2 / Mistral"
elif "gemma" in model_lower:
return "Gemma"
elif "phi" in model_lower:
return "Phi-3"
elif "zephyr" in model_lower:
return "Zephyr"
elif "deepseek" in model_lower:
return "DeepSeek"
return "ChatML (Qwen, OpenHermes, etc.)"
def generate_code_snippet(template_name, system_msg, user_msg):
    """Generate transformers code for this prompt; template_name is unused
    because tokenizer.apply_chat_template reads the template from the model"""
code = f'''from transformers import AutoTokenizer, AutoModelForCausalLM
model_id = "YOUR_MODEL_HERE"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
messages = [
{{"role": "system", "content": """{system_msg}"""}},
{{"role": "user", "content": """{user_msg}"""}}
]
# Apply chat template
prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
# Generate
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
'''
return code
def generate_llama_cpp_snippet(formatted_prompt):
    """Generate llama.cpp compatible code"""
    # Escape backslashes, quotes, and newlines so the prompt survives as a
    # one-line Python string literal in the generated snippet
    escaped = formatted_prompt.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')
    code = f'''# For llama-cpp-python
from llama_cpp import Llama

llm = Llama(model_path="your_model.gguf", n_ctx=4096)

prompt = "{escaped}"

output = llm(
    prompt,
    max_tokens=256,
    stop=["<|im_end|>", "</s>", "<|eot_id|>"],  # Adjust based on template
    echo=False
)
print(output["choices"][0]["text"])
'''
    return code
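
# Illustrative check of the escaping: a newline in the input prompt should
# appear as a literal backslash-n inside the generated snippet's string.
assert '\\n' in generate_llama_cpp_snippet("line one\nline two")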
# ============== GRADIO UI ==============
with gr.Blocks(title="πŸ“ Prompt Template Tester", theme=gr.themes.Soft()) as demo:
gr.Markdown("""
# πŸ“ Prompt Template Tester
See exactly how your prompt gets formatted for different models.
Stop guessing why your model outputs garbage!
**By [AIencoder](https://huggingface.co/AIencoder)** πŸš€
""")
with gr.Tabs():
with gr.TabItem("πŸ’¬ Single Turn"):
with gr.Row():
with gr.Column():
template_dropdown = gr.Dropdown(
label="Template Format",
choices=list(TEMPLATES.keys()),
value="ChatML (Qwen, OpenHermes, etc.)"
)
model_lookup = gr.Textbox(
label="Or enter model name to auto-detect",
placeholder="Qwen/Qwen2.5-7B-Instruct"
)
detect_btn = gr.Button("πŸ” Detect Template", variant="secondary", size="sm")
system_input = gr.Textbox(
label="System Message",
placeholder="You are a helpful coding assistant.",
lines=2
)
user_input = gr.Textbox(
label="User Message",
placeholder="Write a Python function to reverse a string",
lines=3
)
assistant_input = gr.Textbox(
label="Assistant Message (optional, for multi-turn)",
placeholder="Here's a function to reverse a string...",
lines=2
)
gen_prompt_checkbox = gr.Checkbox(
label="Include generation prompt (assistant start token)",
value=True
)
format_btn = gr.Button("πŸš€ Format Prompt", variant="primary")
with gr.Column():
formatted_output = gr.Code(
label="Formatted Prompt",
language=None,
lines=15
)
char_count = gr.Markdown("")
with gr.Accordion("πŸ“‹ Python Code Snippet", open=False):
code_output = gr.Code(label="Transformers Code", language="python", lines=15)
with gr.Accordion("πŸ¦™ llama.cpp Code Snippet", open=False):
llama_output = gr.Code(label="llama-cpp-python Code", language="python", lines=12)
with gr.TabItem("πŸ”„ Multi-Turn"):
with gr.Row():
with gr.Column():
multi_template = gr.Dropdown(
label="Template Format",
choices=list(TEMPLATES.keys()),
value="ChatML (Qwen, OpenHermes, etc.)"
)
conversation_input = gr.Textbox(
label="Conversation (one message per line)",
placeholder="""System: You are a helpful assistant
User: Hello!
Assistant: Hi there! How can I help?
User: What's 2+2?""",
lines=10
)
multi_gen_prompt = gr.Checkbox(
label="Include generation prompt",
value=True
)
multi_format_btn = gr.Button("πŸš€ Format Conversation", variant="primary")
with gr.Column():
multi_output = gr.Code(
label="Formatted Conversation",
language=None,
lines=20
)
with gr.TabItem("πŸ“š Template Reference"):
gr.Markdown("""
## Template Formats
| Template | Used By | Special Tokens |
|----------|---------|----------------|
| **ChatML** | Qwen, OpenHermes, Nous Hermes | `<\|im_start\|>`, `<\|im_end\|>` |
| **Llama 2 / Mistral** | Llama 2, Mistral v0.1-0.3 | `[INST]`, `[/INST]`, `<<SYS>>` |
| **Llama 3** | Llama 3, Llama 3.1 | `<\|start_header_id\|>`, `<\|eot_id\|>` |
| **Alpaca** | Alpaca-style models | `### Instruction:`, `### Response:` |
| **Vicuna** | Vicuna models | `USER:`, `ASSISTANT:` |
| **Gemma** | Google Gemma | `<start_of_turn>`, `<end_of_turn>` |
| **Phi-3** | Microsoft Phi-3 | `<\|system\|>`, `<\|user\|>`, `<\|assistant\|>` |
| **Zephyr** | Zephyr models | `<\|system\|>`, `</s>` |
| **Command-R** | Cohere Command R | `<\|START_OF_TURN_TOKEN\|>`, `<\|END_OF_TURN_TOKEN\|>` |
| **DeepSeek** | DeepSeek chat/coder models | `User:`, `Assistant:`, `<\|end▁of▁sentence\|>` |
## Common Mistakes
❌ **Using wrong template** β†’ Model outputs garbage or repeats prompt
❌ **Missing generation prompt** β†’ Model doesn't know to start generating
❌ **Wrong stop tokens** β†’ Model generates forever or stops too early
❌ **System prompt in wrong place** β†’ Model ignores instructions
## Tips
βœ… Always check the model card for the correct template
βœ… Use `tokenizer.apply_chat_template()` when possible
βœ… Test your prompts here before running expensive inference!
""")
# Event handlers
    def update_char_count(text):
        return f"**Length:** {len(text)} characters, ~{len(text)//4} tokens (rough 4-chars-per-token estimate)"
def on_format(template, system, user, assistant, gen_prompt):
formatted = format_prompt(template, system, user, assistant, gen_prompt)
code = generate_code_snippet(template, system, user)
llama = generate_llama_cpp_snippet(formatted)
count = update_char_count(formatted)
return formatted, count, code, llama
def on_detect(model_name):
return get_template_from_model(model_name)
detect_btn.click(
fn=on_detect,
inputs=[model_lookup],
outputs=[template_dropdown]
)
format_btn.click(
fn=on_format,
inputs=[template_dropdown, system_input, user_input, assistant_input, gen_prompt_checkbox],
outputs=[formatted_output, char_count, code_output, llama_output]
)
multi_format_btn.click(
fn=format_multi_turn,
inputs=[multi_template, conversation_input, multi_gen_prompt],
outputs=[multi_output]
)
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)