# Unsloth should be imported before transformers/torch so its patches apply.
from unsloth import FastLanguageModel

import gradio as gr
import torch

# Define model and tokenizer loading parameters
MODEL_NAME = "unsloth/llama-3-8b-Instruct-bnb-4bit"
max_seq_length = 2048
load_in_4bit = True
dtype = None  # None lets Unsloth auto-detect (float16 or bfloat16 per GPU)

# Load the 4-bit quantized base model and its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Load the fine-tuned LoRA adapter from the saved directory.
# This assumes 'hplc_lora' is in the current working directory
# or in the directory where the app will be deployed.
model.load_adapter("hplc_lora")

# Switch Unsloth into its optimized inference mode
FastLanguageModel.for_inference(model)

# Ensure the tokenizer pad_token is set for generation
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Llama-3 Instruct ends each turn with <|eot_id|> rather than the plain EOS
# token, so generation must be allowed to stop on either.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]


# --- Simple inference function using the fine-tuned model ---
def hplc_chat(user_prompt):
    system_prompt = (
        "You are an expert analytical chemist specializing in HPLC method development "
        "and troubleshooting. Give concise, step-by-step actions with reasoning, "
        "numeric targets, and acceptance criteria."
    )

    # Build the prompt with the tokenizer's own chat template, which inserts
    # the <|start_header_id|>/<|eot_id|> markers in the correct places.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=400,
            do_sample=True,  # required for temperature/top_p to take effect
            temperature=0.4,
            top_p=0.9,
            eos_token_id=terminators,  # stop on <|eot_id|> or EOS
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode only the newly generated tokens, skipping the echoed prompt.
    new_tokens = output[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


# --- Build the Gradio interface ---
demo = gr.Interface(
    fn=hplc_chat,
    inputs=gr.Textbox(
        label="Enter your HPLC question or problem",
        lines=5,
        placeholder="e.g., Retention time drift >5%",
    ),
    outputs=gr.Textbox(label="Model response", lines=12),
    title="🧪 Llama-3 HPLC Method Development & Troubleshooting",
    description=(
        "Ask about gradient setup, peak shape, pressure, or column issues. "
        "The model will suggest step-by-step fixes and acceptance criteria."
    ),
)

# Launch the Gradio app when the script is run directly
if __name__ == "__main__":
    demo.launch(share=True)
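
# --- Optional smoke test (a minimal sketch; the filename and the sample
# question below are illustrative assumptions, not part of the original app) ---
# Before deploying, it can help to verify that the adapter loads and the model
# responds end to end from a REPL, without launching the Gradio UI:
#
#   from app import hplc_chat  # assumes this script is saved as app.py
#   print(hplc_chat("Tailing factor is 2.5 on a C18 column; what should I check?"))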