Spaces:

Derr11
/

Der11

Paused

App Files Files Community

Derr11 committed 12 days ago

Commit

02a5a58

verified ·

1 Parent(s): fbc8aaf

Update app.py

Browse files

Files changed (1) hide show

app.py +82 -387

app.py CHANGED Viewed

@@ -1,406 +1,101 @@
-"""
-نسخة محسّنة من app.py مع دعم Quantization و Memory Optimization
-للنماذج الكبيرة على ZeroGPU
-Optimized version of app.py with Quantization and Memory Optimization
-for large models on ZeroGPU
-"""
 import gradio as gr
 import torch
-import spaces
-from PIL import Image
 import os
-import tempfile
-import gc
-from typing import Optional, Union
-# استيراد المكتبات الضرورية
-try:
-    from uni_moe.model.processing_qwen2_vl import Qwen2VLProcessor
-    from uni_moe.model.modeling_out import GrinQwen2VLOutForConditionalGeneration
-    from uni_moe.qwen_vl_utils import process_mm_info
-    from transformers import BitsAndBytesConfig
-except ImportError as e:
-    print(f"⚠️ Warning: Import error - {e}")
-    print("Some features may not work properly.")
-# ==================== الإعدادات / Configuration ====================
-# اختر النموذج المناسب
-# Choose appropriate model
-MODEL_NAME = "HIT-TMG/Uni-MoE-2.0-Omni"  # النموذج الكامل / Full model
-# MODEL_NAME = "HIT-TMG/Uni-MoE-2.0-Base"  # البديل الأصغر / Smaller alternative
-# إعدادات التحسين / Optimization settings
-USE_4BIT = True  # استخدام 4-bit quantization لتوفير الذاكرة
-USE_8BIT = False  # بديل: استخدام 8-bit quantization
-USE_FLASH_ATTENTION = True  # استخدام Flash Attention للسرعة
-MAX_MEMORY = "20GB"  # الحد الأقصى للذاكرة المستخدمة
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# ==================== تحميل النموذج / Model Loading ====================
-print("="*60)
-print(f"🚀 Loading Uni-MoE 2.0 Model")
-print(f"📍 Model: {MODEL_NAME}")
-print(f"🖥️ Device: {device}")
-print(f"⚙️ 4-bit Quantization: {USE_4BIT}")
-print(f"⚙️ 8-bit Quantization: {USE_8BIT}")
-print("="*60)
-def load_model_optimized():
-    """تحميل النموذج بطريقة محسّنة"""
-    global processor, model
-    try:
-        # تحميل المعالج
-        print("📥 Loading processor...")
-        processor = Qwen2VLProcessor.from_pretrained(MODEL_NAME)
-        # إعداد Quantization Config
-        quantization_config = None
-        if USE_4BIT:
-            print("⚙️ Setting up 4-bit quantization...")
-            quantization_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_use_double_quant=True,
-                bnb_4bit_quant_type="nf4"
-            )
-        elif USE_8BIT:
-            print("⚙️ Setting up 8-bit quantization...")
-            quantization_config = BitsAndBytesConfig(
-                load_in_8bit=True,
-            )
-        # تحميل النموذج
-        print("📥 Loading model (this may take a few minutes)...")
-        load_kwargs = {
-            "device_map": "auto",
-            "torch_dtype": torch.float16 if not USE_4BIT else None,
-            "trust_remote_code": True,
-        }
-        if quantization_config:
-            load_kwargs["quantization_config"] = quantization_config
-        if device == "cuda" and not USE_4BIT and not USE_8BIT:
-            load_kwargs["max_memory"] = {0: MAX_MEMORY}
-        model = GrinQwen2VLOutForConditionalGeneration.from_pretrained(
-            MODEL_NAME,
-            **load_kwargs
-        )
-        # تعيين data_args
-        processor.data_args = model.config
-        print("✅ Model loaded successfully!")
-        print(f"💾 Model size: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B parameters")
-        return True
-    except Exception as e:
-        print(f"❌ Error loading model: {str(e)}")
-        return False
-# تحميل النموذج
-model_loaded = load_model_optimized()
-if not model_loaded:
-    processor = None
-    model = None
-# ==================== دوال مساعدة / Helper Functions ====================
-def clear_gpu_memory():
-    """تنظيف ذاكرة GPU"""
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        gc.collect()
-def estimate_tokens(text: str) -> int:
-    """تقدير عدد التوكنات"""
-    return len(text.split()) * 1.3
-# ==================== دالة التوليد الرئيسية / Main Generation Function ====================
-@spaces.GPU(duration=120)
-def generate_response(
-    text_input: str,
-    image_input: Optional[Image.Image] = None,
-    audio_input: Optional[str] = None,
-    temperature: float = 1.0,
-    max_new_tokens: int = 512,
-    top_p: float = 0.9,
-    repetition_penalty: float = 1.1
-) -> str:
-    """
-    توليد استجابة من النموذج
-    Generate response from the model
-    """
-    # التحقق من توفر النموذج
-    if model is None or processor is None:
-        return "❌ النموذج غير متاح. يرجى التحقق من السجلات.\n❌ Model not available. Please check logs."
-    # تنظيف الذاكرة قبل البدء
-    clear_gpu_memory()
-    try:
-        # التحقق من المدخلات
-        if not text_input and image_input is None and audio_input is None:
-            return "⚠️ يرجى إدخال نص أو صورة أو صوت على الأقل.\n⚠️ Please provide at least text, image, or audio input."
-        # بناء محتوى الرسالة
-        content = []
-        # إضافة النص
-        if text_input:
-            content.append({"type": "text", "text": text_input})
-        # إضافة الصورة
-        temp_image_path = None
-        if image_input is not None:
-            temp_image_path = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg").name
-            image_input.save(temp_image_path)
-            content.append({"type": "image", "image": temp_image_path})
-        # إضافة الصوت
-        if audio_input is not None:
-            content.append({"type": "audio", "audio": audio_input})
-        # بناء الرسائل
-        messages = [{"role": "user", "content": content}]
-        # معالجة النص
-        texts = processor.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-        # استبدال العلامات الخاصة
-        texts = texts.replace(
-            "<image>", "<|vision_start|><|image_pad|><|vision_end|>"
-        ).replace(
-            "<audio>", "<|audio_start|><|audio_pad|><|audio_end|>"
-        ).replace(
-            "<video>", "<|vision_start|><|video_pad|><|vision_end|>"
-        )
-        # معالجة الوسائط
-        image_inputs, video_inputs, audio_inputs = process_mm_info(messages)
-        # تجهيز المدخلات
-        inputs = processor(
-            text=texts,
-            images=image_inputs,
-            videos=video_inputs,
-            audios=audio_inputs,
-            padding=True,
-            return_tensors="pt",
-        )
-        inputs["input_ids"] = inputs["input_ids"].unsqueeze(0)
-        inputs = inputs.to(device=model.device)
-        # التوليد
-        with torch.inference_mode():
-            output_ids = model.generate(
-                **inputs,
-                use_cache=True,
-                pad_token_id=processor.tokenizer.eos_token_id,
-                max_new_tokens=max_new_tokens,
-                temperature=temperature,
-                do_sample=True,
-                top_p=top_p,
-                repetition_penalty=repetition_penalty
-            )
-        # فك التشفير
-        response = processor.batch_decode(
-            output_ids[:, inputs["input_ids"].shape[-1]:],
-            skip_special_tokens=True
-        )[0]
-        # تنظيف الملفات المؤقتة
-        if temp_image_path and os.path.exists(temp_image_path):
-            os.unlink(temp_image_path)
-        # تنظيف الذاكرة
-        clear_gpu_memory()
-        return response
-    except Exception as e:
-        clear_gpu_memory()
-        error_msg = f"❌ خطأ / Error: {str(e)}"
-        print(error_msg)
-        return error_msg
-# ==================== واجهة Gradio / Gradio Interface ====================
-css = """
-.rtl { direction: rtl; text-align: right; }
-.main-header {
-    text-align: center;
-    margin-bottom: 2rem;
-    padding: 2rem;
-    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-    border-radius: 10px;
-    color: white;
-}
-.note-box {
-    padding: 1rem;
-    background: #f0f9ff;
-    border-left: 4px solid #3b82f6;
-    border-radius: 4px;
-    margin: 1rem 0;
-}
-"""
-with gr.Blocks(title="Uni-MoE 2.0 Omni - Optimized", theme=gr.themes.Soft(), css=css) as demo:
-    gr.HTML("""
-    <div class="main-header">
-        <h1>🚀 Uni-MoE 2.0 Omni Demo</h1>
-        <p style="font-size: 1.1em; margin-top: 1rem;">
-            نموذج متعدد الوسائط متقدم - Advanced Omnimodal Model
-        </p>
-        <p style="font-size: 0.9em; opacity: 0.9; margin-top: 0.5rem;">
-            يدعم فهم وتوليد النصوص والصور والصوت<br>
-            Supports understanding and generation of text, images, and audio
-        </p>
-    </div>
-    """)
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Markdown("### 📝 المدخلات / Inputs")
-            text_input = gr.Textbox(
-                label="النص / Text",
-                placeholder="اكتب سؤالك أو وصفك هنا...\nEnter your question or description here...",
-                lines=4,
-                rtl=True
-            )
-            with gr.Row():
-                image_input = gr.Image(
-                    label="الصورة (اختياري) / Image (Optional)",
-                    type="pil",
-                    height=300
-                )
-            audio_input = gr.Audio(
-                label="الصوت (اختياري) / Audio (Optional)",
-                type="filepath"
-            )
-            with gr.Accordion("⚙️ إعدادات متقدمة / Advanced Settings", open=False):
-                temperature = gr.Slider(
-                    minimum=0.1, maximum=2.0, value=0.7, step=0.1,
-                    label="Temperature (الإبداعية / Creativity)"
-                )
-                max_tokens = gr.Slider(
-                    minimum=64, maximum=2048, value=512, step=64,
-                    label="Max Tokens (الطول الأقصى / Max Length)"
-                )
-                top_p = gr.Slider(
-                    minimum=0.1, maximum=1.0, value=0.9, step=0.05,
-                    label="Top P (التنوع / Diversity)"
-                )
-                repetition_penalty = gr.Slider(
-                    minimum=1.0, maximum=2.0, value=1.1, step=0.1,
-                    label="Repetition Penalty (تجنب التكرار / Avoid Repetition)"
-                )
-            with gr.Row():
-                submit_btn = gr.Button("🎯 توليد / Generate", variant="primary", size="lg")
-                clear_btn = gr.Button("🗑️ مسح / Clear", size="lg")
-        with gr.Column(scale=1):
-            gr.Markdown("### 💬 النتيجة / Output")
-            output = gr.Textbox(
-                label="الاستجابة / Response",
-                lines=20,
-                show_copy_button=True,
-                rtl=True
-            )
-    # ملاحظات مهمة
-    gr.HTML("""
-    <div class="note-box">
-        <h3>📌 ملاحظات مهمة / Important Notes</h3>
-        <ul>
-            <li>⏱️ قد يستغرق التوليد 30-60 ثانية / Generation may take 30-60 seconds</li>
-            <li>💾 يستخدم النموذج quantization لتوفير الذاكرة / Model uses quantization to save memory</li>
-            <li>🔄 يتم تنظيف الذاكرة تلقائياً بعد كل استخدام / Memory is cleared automatically after each use</li>
-        </ul>
-    </div>
-    """)
-    # أمثلة
-    gr.Markdown("### 📚 أمثلة / Examples")
-    gr.Examples(
-        examples=[
-            ["ما هي عاصمة مصر؟ What is the capital of Egypt?", None, None],
-            ["صف هذه الصورة بالتفصيل\nDescribe this image in detail", "https://picsum.photos/400/300", None],
-            ["قارن بين Python و JavaScript\nCompare Python and JavaScript", None, None],
-        ],
-        inputs=[text_input, image_input, audio_input],
     )
-    # معلومات إضافية
-    gr.Markdown("""
-    ---
-    ### ℹ️ حول النموذج / About the Model
-    **Uni-MoE 2.0 Omni** بني على:
-    - 🧠 Mixture-of-Experts (MoE) architecture
-    - 📊 Qwen2.5-7B base model (~33B parameters with experts)
-    - 🌐 Omni-Modality 3D RoPE for cross-modal alignment
-    - ⚡ Dynamic-Capacity routing mechanism
-    **الأداء / Performance:**
-    - ✅ +7% على فهم الفيديو / video understanding
-    - ✅ +4% على الاستدلال السمعي-البصري / audio-visual reasoning
-    - ✅ متفوق على Qwen2.5-Omni في 50+ معياراً / benchmarks
-    📄 [ورقة بحثية / Paper](https://arxiv.org/abs/2511.12609) |
-    💻 [GitHub](https://github.com/HITsz-TMG/Uni-MoE) |
-    🤗 [Model](https://huggingface.co/HIT-TMG/Uni-MoE-2.0-Omni)
-    """)
-    # ربط الأحداث
-    submit_btn.click(
-        fn=generate_response,
-        inputs=[text_input, image_input, audio_input, temperature, max_tokens, top_p, repetition_penalty],
-        outputs=output
-    )
-    clear_btn.click(
-        fn=lambda: (None, None, None, None),
-        outputs=[text_input, image_input, audio_input, output]
-    )
-# تشغيل التطبيق
-if __name__ == "__main__":
-    demo.queue(max_size=20, default_concurrency_limit=5)
-    demo.launch(
-        share=False,
-        show_error=True,
-        server_name="0.0.0.0",
-        server_port=7860
-    )

 import gradio as gr
 import torch
 import os
+# Import the model-specific modules (make sure the uni_moe folder sits next to this file)
+from uni_moe.model.processing_qwen2_vl import Qwen2VLProcessor
+from uni_moe.model.modeling_out import GrinQwen2VLOutForConditionalGeneration
+from uni_moe.qwen_vl_utils import process_mm_info
+# Model configuration: the model identifier to load, and CUDA when torch
+# reports it as available (CPU otherwise)
+MODEL_ID = "HIT-TMG/Uni-MoE-2.0-Omni"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Loading model on {DEVICE}...")
+# Load the processor and the model once, at import time. The weights are
+# requested in bfloat16 and the whole model is then moved to DEVICE.
+processor = Qwen2VLProcessor.from_pretrained(MODEL_ID)
+model = GrinQwen2VLOutForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.bfloat16
+).to(DEVICE)
+# Attach the model config to the processor as data_args.
+# NOTE(review): presumably the uni_moe processor reads data_args while
+# preprocessing inputs — confirm against the uni_moe sources.
+processor.data_args = model.config
+def generate_response(text_input, image_path, audio_path):
+    # تجهيز محتوى الرسالة
+    content = []
+    # إضافة النص مع التاجات الخاصة إذا وجدت وسائط
+    prompt_text = text_input
+    if audio_path:
+        content.append({"type": "audio", "audio": audio_path})
+        prompt_text = "<audio>\n" + prompt_text
+    if image_path:
+        content.append({"type": "image", "image": image_path})
+        prompt_text = "<image>\n" + prompt_text
+    content.append({"type": "text", "text": prompt_text})
+    messages = [{
+        "role": "user",
+        "content": content
+    }]
+    # معالجة القوالب (Chat Template)
+    texts = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # استبدال التاجات الخاصة كما في المثال الأصلي
+    texts = texts.replace("<image>","<|vision_start|><|image_pad|><|vision_end|>") \
+                 .replace("<audio>","<|audio_start|><|audio_pad|><|audio_end|>") \
+                 .replace("<video>","<|vision_start|><|video_pad|><|vision_end|>")
+    # معالجة الوسائط
+    image_inputs, video_inputs, audio_inputs = process_mm_info(messages)
+    # تجهيز المدخلات للنموذج
+    inputs = processor(
+        text=texts,
+        images=image_inputs,
+        videos=video_inputs,
+        audios=audio_inputs,
+        padding=True,
+        return_tensors="pt",
     )
+    # إضافة بعد جديد للـ inputs ونقلها للـ GPU
+    if "input_ids" in inputs:
+        inputs["input_ids"] = inputs["input_ids"].unsqueeze(0) # Unsqueeze كما في المثال
+    inputs = inputs.to(device=model.device)
+    # التوليد
+    with torch.no_grad():
+        output_ids = model.generate(
+            **inputs,
+            use_cache=True,
+            pad_token_id=processor.tokenizer.eos_token_id,
+            max_new_tokens=2048, # تم التقليل قليلاً لتسريع الاستجابة في الويب
+            temperature=0.7,
+            do_sample=True
+        )
+    # فك التشفير واستخراج النص فقط
+    response = processor.batch_decode(output_ids[:, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)[0]
+    return response
+# بناء واجهة Gradio
+with gr.Interface(
+    fn=generate_response,
+    inputs=[
+        gr.Textbox(label="Question/Prompt", placeholder="Describe the image or audio..."),
+        gr.Image(type="filepath", label="Upload Image (Optional)"),
+        gr.Audio(type="filepath", label="Upload Audio (Optional)")
+    ],
+    outputs=gr.Textbox(label="Uni-MoE Response"),
+    title="Uni-MoE 2.0 Omni Demo",
+    description="Upload an image or audio and ask questions about them using Uni-MoE 2.0."
+) as demo:
+    demo.launch()