import gradio as gr
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
import torch
import spaces # مكتبة ZeroGPU

# 1. إعدادات النموذج (Qwen3-Omni-Thinking)
MODEL_ID = "Qwen/Qwen3-Omni-30B-A3B-Thinking"

print(f"جاري تحميل النموذج العملاق {MODEL_ID}... هذا سيستغرق بضعة دقائق.")

# 4-bit NF4 quantization settings (double quantization, bfloat16 compute).
# Per the original author's note, the goal is to fit the model into ZeroGPU
# memory.
nf4_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Tokenizer for the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Load the model with the quantization config above.
# AutoModel is used here on purpose instead of AutoModelForCausalLM
# (a change made by the original author).
model = AutoModel.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=nf4_config,
)

print("تم تحميل النموذج بنجاح! المعلم جاهز.")

# 2. دالة التفكير والرد
def _history_to_messages(history):
    """Convert Gradio chat history into a list of role/content dicts.

    Two history layouts are accepted:

    * pair style: ``[(user_msg, bot_msg), ...]``
    * messages style: ``[{"role": ..., "content": ...}, ...]``

    Entries whose content is ``None`` are dropped so they are never handed
    to the chat template.

    Args:
        history: The history value passed in by the chat UI; may be ``None``
            or empty.

    Returns:
        A new list of ``{"role": str, "content": ...}`` dicts in
        conversation order.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Messages-style entry: copy only the two keys the chat template
            # needs and ignore any extra keys the UI may attach.
            role = turn.get("role")
            content = turn.get("content")
            if role and content is not None:
                messages.append({"role": role, "content": content})
            continue

        # Pair-style entry: (user message, assistant message).
        user_msg, bot_msg = turn
        if user_msg is not None:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg is not None:
            messages.append({"role": "assistant", "content": bot_msg})
    return messages


@spaces.GPU(duration=120)
def chat_with_thinking_model(message, history):
    """Generate the model's reply to ``message`` given the prior ``history``.

    The conversation is rendered with the tokenizer's chat template, passed
    to ``model.generate`` and only the newly generated tokens are decoded.

    Args:
        message: The latest user message.
        history: Previous turns, in either pair style or messages style
            (see ``_history_to_messages``).

    Returns:
        The decoded reply text with special tokens removed.
    """
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=1024,
        # Temperature is only applied in sampling mode, so enable sampling
        # explicitly instead of relying on the checkpoint's defaults.
        do_sample=True,
        temperature=0.7,
    )

    # The generated sequence starts with the prompt tokens; keep only what
    # follows them. There is exactly one sequence in the batch.
    prompt_length = len(model_inputs.input_ids[0])
    reply_ids = generated_ids[0][prompt_length:]

    return tokenizer.decode(reply_ids, skip_special_tokens=True)

# 3. واجهة المستخدم
# NOTE(review): no component visible in this file sets elem_id="chatbot", so
# this rule may not match anything -- confirm.
custom_css = "\n#chatbot {min-height: 400px;}\n"

# Example prompts passed to the chat interface.
_EXAMPLE_PROMPTS = [
    "اشرح لي النظرية النسبية وكأنني طفل في الخامسة",
    "حل المعادلة: س^2 + 5س + 6 = 0 مع الشرح",
]

# Page layout: a title, a short description, then the chat interface.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown("# 🧠 Nasaq AI Tutor (Thinking Mode)")
    gr.Markdown("هذا النموذج يستخدم **Qwen3-Omni-Thinking**. ستلاحظ أنه قد يكتب خطوات تفكيره قبل الإجابة النهائية.")

    chatbot = gr.ChatInterface(
        cache_examples=False,
        examples=_EXAMPLE_PROMPTS,
        fn=chat_with_thinking_model,
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()