import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel

# Model names
base_model_name = "unsloth/qwen2.5-coder-3b-instruct-bnb-4bit"
lora_model_name = "MarioCap/OCodeR_500-Qwen-2.5-Code-3B"

# Load tokenizer and 4-bit base model, then attach the LoRA adapter
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name, device_map="auto", trust_remote_code=True
)
model = PeftModel.from_pretrained(base_model, lora_model_name)

# Create a text-generation pipeline around the adapted model
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Chat/inference function
def generate(prompt, max_new_tokens=200, temperature=0.7):
    response = pipe(
        prompt,
        max_new_tokens=int(max_new_tokens),  # Gradio sliders may pass floats
        temperature=float(temperature),
        do_sample=True,
    )
    return response[0]["generated_text"]

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("### 💡 Code Assistant - OCodeR + Qwen 2.5")
    with gr.Row():
        prompt = gr.Textbox(
            label="Enter your coding prompt",
            placeholder="Write a Python function to reverse a string...",
        )
    with gr.Row():
        max_tokens = gr.Slider(50, 1024, value=200, step=50, label="Max New Tokens")
        temp = gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
    with gr.Row():
        output = gr.Textbox(label="Generated Output")
    with gr.Row():
        submit = gr.Button("Generate")
    submit.click(fn=generate, inputs=[prompt, max_tokens, temp], outputs=output)

demo.launch()
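
# --- Environment note (assumption, not stated in the original script) ---
# A working environment for this app would typically need gradio, transformers,
# peft, accelerate, and bitsandbytes, since the base checkpoint is a bnb 4-bit
# model (a CUDA-capable GPU with bitsandbytes support is assumed). A plausible
# install line, versions unpinned:
#   pip install gradio transformers peft accelerate bitsandbytes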