import gradio as gr

# ---------------------------------------------------------------------------
# Template definitions
#
# Each template maps role markers to the literal strings a model expects.
# NOTE(review): several special tokens (<<SYS>>, <start_of_turn>, </s>) had
# been stripped to "<>" or "" by an HTML-escaping artifact; they are restored
# below per each model's documented chat format.
# ---------------------------------------------------------------------------
TEMPLATES = {
    "ChatML (Qwen, OpenHermes, etc.)": {
        "system_start": "<|im_start|>system\n",
        "system_end": "<|im_end|>\n",
        "user_start": "<|im_start|>user\n",
        "user_end": "<|im_end|>\n",
        "assistant_start": "<|im_start|>assistant\n",
        "assistant_end": "<|im_end|>\n",
        "bos": "",
        "eos": "",
    },
    "Llama 2 / Mistral": {
        # Official Llama 2 format wraps the system prompt in <<SYS>> markers.
        "system_start": "[INST] <<SYS>>\n",
        "system_end": "\n<</SYS>>\n\n",
        "user_start": "",
        "user_end": " [/INST] ",
        "assistant_start": "",
        # An assistant turn is closed with </s> and the next turn reopens <s>[INST].
        "assistant_end": "</s><s>[INST] ",
        "bos": "",
        "eos": "",
        "first_user_start": "[INST] ",  # When no system prompt
    },
    "Llama 3 / Llama 3.1": {
        "system_start": "<|start_header_id|>system<|end_header_id|>\n\n",
        "system_end": "<|eot_id|>",
        "user_start": "<|start_header_id|>user<|end_header_id|>\n\n",
        "user_end": "<|eot_id|>",
        "assistant_start": "<|start_header_id|>assistant<|end_header_id|>\n\n",
        "assistant_end": "<|eot_id|>",
        "bos": "<|begin_of_text|>",
        "eos": "",
    },
    "Alpaca": {
        "system_start": "",
        "system_end": "",
        "user_start": "### Instruction:\n",
        "user_end": "\n\n",
        "input_start": "### Input:\n",
        "input_end": "\n\n",
        "assistant_start": "### Response:\n",
        "assistant_end": "\n\n",
        "bos": "",
        "eos": "",
    },
    "Vicuna": {
        "system_start": "",
        "system_end": "\n\n",
        "user_start": "USER: ",
        "user_end": "\n",
        "assistant_start": "ASSISTANT: ",
        "assistant_end": "\n",
        "bos": "",
        "eos": "",
    },
    "Gemma": {
        # Gemma has no system role; system text is typically folded into the
        # first user turn by callers.
        "system_start": "",
        "system_end": "",
        "user_start": "<start_of_turn>user\n",
        "user_end": "<end_of_turn>\n",
        "assistant_start": "<start_of_turn>model\n",
        "assistant_end": "<end_of_turn>\n",
        "bos": "<bos>",
        "eos": "",
    },
    "Phi-3": {
        "system_start": "<|system|>\n",
        "system_end": "<|end|>\n",
        "user_start": "<|user|>\n",
        "user_end": "<|end|>\n",
        "assistant_start": "<|assistant|>\n",
        "assistant_end": "<|end|>\n",
        "bos": "",
        "eos": "",
    },
    "Zephyr": {
        # Zephyr terminates every message with the EOS token </s>.
        "system_start": "<|system|>\n",
        "system_end": "</s>\n",
        "user_start": "<|user|>\n",
        "user_end": "</s>\n",
        "assistant_start": "<|assistant|>\n",
        "assistant_end": "</s>\n",
        "bos": "",
        "eos": "",
    },
    "Command-R": {
        "system_start": "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>",
        "system_end": "<|END_OF_TURN_TOKEN|>",
        "user_start": "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>",
        "user_end": "<|END_OF_TURN_TOKEN|>",
        "assistant_start": "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
        "assistant_end": "<|END_OF_TURN_TOKEN|>",
        "bos": "",
        "eos": "",
    },
    "DeepSeek": {
        "system_start": "",
        "system_end": "\n\n",
        "user_start": "User: ",
        "user_end": "\n\n",
        "assistant_start": "Assistant: ",
        "assistant_end": "<|end▁of▁sentence|>",
        "bos": "<|begin▁of▁sentence|>",
        "eos": "",
    },
    "Raw (No Template)": {
        "system_start": "System: ",
        "system_end": "\n\n",
        "user_start": "User: ",
        "user_end": "\n\n",
        "assistant_start": "Assistant: ",
        "assistant_end": "\n\n",
        "bos": "",
        "eos": "",
    },
}

# Model-id -> template-name mapping for exact matches in the auto-detector.
MODEL_TEMPLATES = {
    "Qwen/Qwen2.5-7B-Instruct": "ChatML (Qwen, OpenHermes, etc.)",
    "Qwen/Qwen2.5-Coder-7B-Instruct": "ChatML (Qwen, OpenHermes, etc.)",
    "teknium/OpenHermes-2.5-Mistral-7B": "ChatML (Qwen, OpenHermes, etc.)",
    "NousResearch/Hermes-2-Pro-Mistral-7B": "ChatML (Qwen, OpenHermes, etc.)",
    "mistralai/Mistral-7B-Instruct-v0.2": "Llama 2 / Mistral",
    "mistralai/Mistral-7B-Instruct-v0.3": "Llama 2 / Mistral",
    "meta-llama/Llama-2-7b-chat-hf": "Llama 2 / Mistral",
    "meta-llama/Meta-Llama-3-8B-Instruct": "Llama 3 / Llama 3.1",
    "meta-llama/Meta-Llama-3.1-8B-Instruct": "Llama 3 / Llama 3.1",
    "google/gemma-1.1-7b-it": "Gemma",
    "google/gemma-2-9b-it": "Gemma",
    "microsoft/Phi-3-mini-4k-instruct": "Phi-3",
    "HuggingFaceH4/zephyr-7b-beta": "Zephyr",
    "CohereForAI/c4ai-command-r-v01": "Command-R",
    "deepseek-ai/deepseek-coder-7b-instruct-v1.5": "DeepSeek",
}


def format_prompt(template_name, system_msg, user_msg, assistant_msg,
                  include_generation_prompt):
    """Format a single-turn exchange using the selected template.

    Args:
        template_name: key into TEMPLATES.
        system_msg / user_msg / assistant_msg: message texts (may be None or
            empty — Gradio passes None for cleared textboxes).
        include_generation_prompt: append the assistant start marker so the
            model knows to begin generating.

    Returns:
        The fully formatted prompt string, or an error string for an
        unknown template name.
    """
    if template_name not in TEMPLATES:
        return "Template not found"

    t = TEMPLATES[template_name]
    # Guard against None from cleared Gradio inputs before calling .strip().
    system_msg = (system_msg or "").strip()
    user_msg = (user_msg or "").strip()
    assistant_msg = (assistant_msg or "").strip()

    prompt = ""

    # Add BOS token
    if t.get("bos"):
        prompt += t["bos"]

    # Add system message
    if system_msg:
        prompt += t["system_start"] + system_msg + t["system_end"]

    # Add user message
    if user_msg:
        # Special case for Llama 2 without system prompt: the first user turn
        # opens with a bare "[INST] " instead of the <<SYS>> wrapper.
        if template_name == "Llama 2 / Mistral" and not system_msg:
            prompt += t.get("first_user_start", t["user_start"])
        else:
            prompt += t["user_start"]
        prompt += user_msg + t["user_end"]

    # Add assistant message (if provided, for multi-turn)
    if assistant_msg:
        prompt += t["assistant_start"] + assistant_msg + t["assistant_end"]

    # Add generation prompt (assistant start token)
    if include_generation_prompt:
        prompt += t["assistant_start"]

    return prompt


def format_multi_turn(template_name, conversation_text, include_generation_prompt):
    """Format a multi-turn conversation.

    The conversation is plain text, one message per line, each prefixed with
    "System:", "User:" or "Assistant:" (case-insensitive). Unprefixed lines
    are ignored.
    """
    if template_name not in TEMPLATES:
        return "Template not found"

    t = TEMPLATES[template_name]
    prompt = ""

    # Add BOS token
    if t.get("bos"):
        prompt += t["bos"]

    # Parse conversation
    lines = (conversation_text or "").strip().split("\n")
    system_msg = ""
    messages = []

    for line in lines:
        line = line.strip()
        if not line:
            continue
        if line.lower().startswith("system:"):
            system_msg = line[7:].strip()
        elif line.lower().startswith("user:"):
            messages.append(("user", line[5:].strip()))
        elif line.lower().startswith("assistant:"):
            messages.append(("assistant", line[10:].strip()))

    # Add system message
    if system_msg:
        prompt += t["system_start"] + system_msg + t["system_end"]

    # Add conversation turns
    for i, (role, content) in enumerate(messages):
        if role == "user":
            # Special case for Llama 2 first user without system
            if template_name == "Llama 2 / Mistral" and i == 0 and not system_msg:
                prompt += t.get("first_user_start", t["user_start"])
            else:
                prompt += t["user_start"]
            prompt += content + t["user_end"]
        elif role == "assistant":
            prompt += t["assistant_start"] + content + t["assistant_end"]

    # Add generation prompt
    if include_generation_prompt:
        prompt += t["assistant_start"]

    return prompt


def get_template_from_model(model_name):
    """Return the template name for a model id (exact match, then heuristic).

    Falls back to ChatML when nothing matches, since it is the most common
    modern format.
    """
    model_name = model_name or ""
    if model_name in MODEL_TEMPLATES:
        return MODEL_TEMPLATES[model_name]

    # Try to guess from model name
    model_lower = model_name.lower()
    if "qwen" in model_lower or "hermes" in model_lower:
        return "ChatML (Qwen, OpenHermes, etc.)"
    elif "llama-3" in model_lower or "llama3" in model_lower:
        return "Llama 3 / Llama 3.1"
    elif "llama-2" in model_lower or "mistral" in model_lower:
        return "Llama 2 / Mistral"
    elif "gemma" in model_lower:
        return "Gemma"
    elif "phi" in model_lower:
        return "Phi-3"
    elif "zephyr" in model_lower:
        return "Zephyr"
    elif "deepseek" in model_lower:
        return "DeepSeek"
    return "ChatML (Qwen, OpenHermes, etc.)"


def generate_code_snippet(template_name, system_msg, user_msg):
    """Generate a transformers code snippet using apply_chat_template."""
    code = f'''from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "YOUR_MODEL_HERE"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

messages = [
    {{"role": "system", "content": """{system_msg}"""}},
    {{"role": "user", "content": """{user_msg}"""}}
]

# Apply chat template
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# Generate
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
'''
    return code


def generate_llama_cpp_snippet(formatted_prompt):
    """Generate llama-cpp-python code embedding the formatted prompt.

    The prompt is escaped so the emitted snippet stays valid Python even when
    the prompt contains backslashes, quotes or newlines.
    """
    # Escape for embedding inside a double-quoted Python string literal.
    # (Previously this value was computed but never used, so prompts with
    # quotes or backslashes produced broken snippets.)
    escaped = (
        formatted_prompt.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )
    code = f'''# For llama-cpp-python
from llama_cpp import Llama

llm = Llama(model_path="your_model.gguf", n_ctx=4096)

prompt = "{escaped}"

output = llm(
    prompt,
    max_tokens=256,
    stop=["<|im_end|>", "</s>", "<|eot_id|>"],  # Adjust based on template
    echo=False
)
print(output["choices"][0]["text"])
'''
    return code


# ============== GRADIO UI ==============
with gr.Blocks(title="📝 Prompt Template Tester", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 📝 Prompt Template Tester

    See exactly how your prompt gets formatted for different models.
    Stop guessing why your model outputs garbage!

    **By [AIencoder](https://huggingface.co/AIencoder)** 🚀
    """)

    with gr.Tabs():
        with gr.TabItem("💬 Single Turn"):
            with gr.Row():
                with gr.Column():
                    template_dropdown = gr.Dropdown(
                        label="Template Format",
                        choices=list(TEMPLATES.keys()),
                        value="ChatML (Qwen, OpenHermes, etc.)"
                    )
                    model_lookup = gr.Textbox(
                        label="Or enter model name to auto-detect",
                        placeholder="Qwen/Qwen2.5-7B-Instruct"
                    )
                    detect_btn = gr.Button("🔍 Detect Template", variant="secondary", size="sm")
                    system_input = gr.Textbox(
                        label="System Message",
                        placeholder="You are a helpful coding assistant.",
                        lines=2
                    )
                    user_input = gr.Textbox(
                        label="User Message",
                        placeholder="Write a Python function to reverse a string",
                        lines=3
                    )
                    assistant_input = gr.Textbox(
                        label="Assistant Message (optional, for multi-turn)",
                        placeholder="Here's a function to reverse a string...",
                        lines=2
                    )
                    gen_prompt_checkbox = gr.Checkbox(
                        label="Include generation prompt (assistant start token)",
                        value=True
                    )
                    format_btn = gr.Button("🚀 Format Prompt", variant="primary")

                with gr.Column():
                    formatted_output = gr.Code(
                        label="Formatted Prompt",
                        language=None,
                        lines=15
                    )
                    char_count = gr.Markdown("")
                    with gr.Accordion("📋 Python Code Snippet", open=False):
                        code_output = gr.Code(label="Transformers Code", language="python", lines=15)
                    with gr.Accordion("🦙 llama.cpp Code Snippet", open=False):
                        llama_output = gr.Code(label="llama-cpp-python Code", language="python", lines=12)

        with gr.TabItem("🔄 Multi-Turn"):
            with gr.Row():
                with gr.Column():
                    multi_template = gr.Dropdown(
                        label="Template Format",
                        choices=list(TEMPLATES.keys()),
                        value="ChatML (Qwen, OpenHermes, etc.)"
                    )
                    conversation_input = gr.Textbox(
                        label="Conversation (one message per line)",
                        placeholder="""System: You are a helpful assistant
User: Hello!
Assistant: Hi there! How can I help?
User: What's 2+2?""",
                        lines=10
                    )
                    multi_gen_prompt = gr.Checkbox(
                        label="Include generation prompt",
                        value=True
                    )
                    multi_format_btn = gr.Button("🚀 Format Conversation", variant="primary")

                with gr.Column():
                    multi_output = gr.Code(
                        label="Formatted Conversation",
                        language=None,
                        lines=20
                    )

        with gr.TabItem("📚 Template Reference"):
            gr.Markdown("""
            ## Template Formats

            | Template | Used By | Special Tokens |
            |----------|---------|----------------|
            | **ChatML** | Qwen, OpenHermes, Nous Hermes | `<\\|im_start\\|>`, `<\\|im_end\\|>` |
            | **Llama 2 / Mistral** | Llama 2, Mistral v0.1-0.3 | `[INST]`, `[/INST]`, `<<SYS>>` |
            | **Llama 3** | Llama 3, Llama 3.1 | `<\\|start_header_id\\|>`, `<\\|eot_id\\|>` |
            | **Alpaca** | Alpaca-style models | `### Instruction:`, `### Response:` |
            | **Vicuna** | Vicuna models | `USER:`, `ASSISTANT:` |
            | **Gemma** | Google Gemma | `<start_of_turn>`, `<end_of_turn>` |
            | **Phi-3** | Microsoft Phi-3 | `<\\|system\\|>`, `<\\|user\\|>`, `<\\|assistant\\|>` |
            | **Zephyr** | Zephyr models | `<\\|system\\|>`, `</s>` |

            ## Common Mistakes

            ❌ **Using wrong template** → Model outputs garbage or repeats prompt
            ❌ **Missing generation prompt** → Model doesn't know to start generating
            ❌ **Wrong stop tokens** → Model generates forever or stops too early
            ❌ **System prompt in wrong place** → Model ignores instructions

            ## Tips

            ✅ Always check the model card for the correct template
            ✅ Use `tokenizer.apply_chat_template()` when possible
            ✅ Test your prompts here before running expensive inference!
            """)

    # Event handlers
    def update_char_count(text):
        # Rough heuristic: ~4 characters per token.
        return f"**Length:** {len(text)} characters, ~{len(text)//4} tokens"

    def on_format(template, system, user, assistant, gen_prompt):
        formatted = format_prompt(template, system, user, assistant, gen_prompt)
        code = generate_code_snippet(template, system, user)
        llama = generate_llama_cpp_snippet(formatted)
        count = update_char_count(formatted)
        return formatted, count, code, llama

    def on_detect(model_name):
        return get_template_from_model(model_name)

    detect_btn.click(
        fn=on_detect,
        inputs=[model_lookup],
        outputs=[template_dropdown]
    )

    format_btn.click(
        fn=on_format,
        inputs=[template_dropdown, system_input, user_input, assistant_input, gen_prompt_checkbox],
        outputs=[formatted_output, char_count, code_output, llama_output]
    )

    multi_format_btn.click(
        fn=format_multi_turn,
        inputs=[multi_template, conversation_input, multi_gen_prompt],
        outputs=[multi_output]
    )

demo.launch(server_name="0.0.0.0", server_port=7860)