Spaces:

Derr11
/

Der11

Paused

App Files Files Community

Derr11 committed 13 days ago

Commit

0db7e85

verified ·

1 Parent(s): 210a758

Update app.py

Browse files

Files changed (1) hide show

app.py +244 -111

app.py CHANGED Viewed

@@ -1,63 +1,165 @@
 import gradio as gr
 import torch
 import spaces
 from PIL import Image
-import numpy as np
 import os
 import tempfile
-# استيراد المكتبات الضرورية من Uni-MoE
 try:
     from uni_moe.model.processing_qwen2_vl import Qwen2VLProcessor
     from uni_moe.model.modeling_out import GrinQwen2VLOutForConditionalGeneration
     from uni_moe.qwen_vl_utils import process_mm_info
-    from uni_moe.model import deepspeed_moe_inference_utils
-except ImportError:
-    print("⚠️ Warning: Uni-MoE libraries not fully imported. Some features may not work.")
-# تحميل النموذج
-MODEL_NAME = "HIT-TMG/Uni-MoE-2.0-Omni"
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"🚀 Loading model: {MODEL_NAME}")
-print(f"📍 Device: {device}")
-# تحميل المعالج والنموذج
-try:
-    processor = Qwen2VLProcessor.from_pretrained(MODEL_NAME)
-    model = GrinQwen2VLOutForConditionalGeneration.from_pretrained(
-        MODEL_NAME,
-        torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
-        device_map="auto"
-    )
-    if device == "cuda":
-        model = model.cuda()
-    # تعيين data_args
-    processor.data_args = model.config
-    print("✅ Model loaded successfully!")
-except Exception as e:
-    print(f"❌ Error loading model: {str(e)}")
     processor = None
     model = None
-@spaces.GPU(duration=120)  # استخدام ZeroGPU لمدة 120 ثانية
 def generate_response(
     text_input: str,
-    image_input: Image.Image = None,
-    audio_input: str = None,
     temperature: float = 1.0,
-    max_new_tokens: int = 512
-):
     """
-    توليد استجابة من النموذج بناءً على المدخلات المختلفة
     """
     if model is None or processor is None:
-        return "❌ النموذج غير متاح حالياً. يرجى المحاولة لاحقاً."
     try:
-        # بناء رسالة المستخدم
         content = []
         # إضافة النص
@@ -65,29 +167,23 @@ def generate_response(
             content.append({"type": "text", "text": text_input})
         # إضافة الصورة
         if image_input is not None:
-            # حفظ الصورة مؤقتاً
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp_img:
-                image_input.save(tmp_img.name)
-                content.append({"type": "image", "image": tmp_img.name})
         # إضافة الصوت
         if audio_input is not None:
             content.append({"type": "audio", "audio": audio_input})
-        if not content:
-            return "⚠️ يرجى إدخال نص أو صورة أو صوت."
         # بناء الرسائل
-        messages = [{
-            "role": "user",
-            "content": content
-        }]
-        # معالجة الرسائل
         texts = processor.apply_chat_template(
-            messages,
-            tokenize=False,
             add_generation_prompt=True
         )
@@ -117,47 +213,73 @@ def generate_response(
         inputs = inputs.to(device=model.device)
         # التوليد
-        with torch.no_grad():
             output_ids = model.generate(
                 **inputs,
                 use_cache=True,
                 pad_token_id=processor.tokenizer.eos_token_id,
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
-                do_sample=True
             )
-        # فك تشفير النتيجة
         response = processor.batch_decode(
-            output_ids[:, inputs["input_ids"].shape[-1]:],
             skip_special_tokens=True
         )[0]
         return response
     except Exception as e:
-        return f"❌ حدث خطأ: {str(e)}"
-# إنشاء واجهة Gradio
-with gr.Blocks(
-    title="Uni-MoE 2.0 Omni Demo",
-    theme=gr.themes.Soft(),
-    css="""
-        .rtl { direction: rtl; text-align: right; }
-        .main-header { text-align: center; margin-bottom: 2rem; }
-    """
-) as demo:
-    gr.Markdown("""
     <div class="main-header">
-    # 🚀 Uni-MoE 2.0 Omni Demo
-    نموذج متعدد الوسائط متقدم يدعم فهم وتوليد **النصوص والصور والصوت**
-    An advanced omnimodal model supporting understanding and generation of **text, images, and audio**
     </div>
     """)
@@ -167,15 +289,17 @@ with gr.Blocks(
             text_input = gr.Textbox(
                 label="النص / Text",
-                placeholder="اكتب سؤالك أو وصفك هنا... / Enter your question or description here...",
-                lines=3,
                 rtl=True
             )
-            image_input = gr.Image(
-                label="الصورة (اختياري) / Image (Optional)",
-                type="pil"
-            )
             audio_input = gr.Audio(
                 label="الصوت (اختياري) / Audio (Optional)",
@@ -184,42 +308,55 @@ with gr.Blocks(
             with gr.Accordion("⚙️ إعدادات متقدمة / Advanced Settings", open=False):
                 temperature = gr.Slider(
-                    minimum=0.1,
-                    maximum=2.0,
-                    value=1.0,
-                    step=0.1,
-                    label="Temperature"
                 )
                 max_tokens = gr.Slider(
-                    minimum=64,
-                    maximum=2048,
-                    value=512,
-                    step=64,
-                    label="Max New Tokens"
                 )
-            submit_btn = gr.Button("🎯 توليد / Generate", variant="primary")
-            clear_btn = gr.Button("🗑️ مسح / Clear")
         with gr.Column(scale=1):
             gr.Markdown("### 💬 النتيجة / Output")
             output = gr.Textbox(
                 label="الاستجابة / Response",
-                lines=15,
                 show_copy_button=True,
                 rtl=True
             )
     # أمثلة
     gr.Markdown("### 📚 أمثلة / Examples")
     gr.Examples(
         examples=[
-            ["ما هي عاصمة مصر؟", None, None],
-            ["صف هذه الصورة بالتفصيل", "https://picsum.photos/400/300", None],
-            ["What is the capital of France?", None, None],
-            ["Describe this image in detail", "https://picsum.photos/400/300", None],
         ],
         inputs=[text_input, image_input, audio_input],
     )
@@ -227,32 +364,28 @@ with gr.Blocks(
     # معلومات إضافية
     gr.Markdown("""
     ---
-    ### ℹ️ معلومات / Information
-    **Uni-MoE 2.0 Omni** هو نموذج لغوي متعدد الوسائط (Omnimodal) مبني على معماريات:
-    - 🧠 **Mixture-of-Experts (MoE)** لكفاءة الحوسبة
-    - 🔄 **Qwen2.5-7B** كقاعدة أساسية
-    - 🎯 **Omni-Modality 3D RoPE** لمحاذاة متعددة الوسائط
-    **القدرات:**
-    - ✅ فهم النصوص والصور والصوت والفيديو
-    - ✅ توليد النصوص والصور والصوت
-    - ✅ استدلال متعدد الوسائط
-    📄 **ورقة بحثية:** [arXiv:2511.12609](https://arxiv.org/abs/2511.12609)
-    🔗 **GitHub:** [HITsz-TMG/Uni-MoE](https://github.com/HITsz-TMG/Uni-MoE)
-    ---
-    <p style="text-align: center; color: #666;">
-    تم إنشاؤه باستخدام Gradio و ZeroGPU 🚀
-    </p>
     """)
     # ربط الأحداث
     submit_btn.click(
         fn=generate_response,
-        inputs=[text_input, image_input, audio_input, temperature, max_tokens],
         outputs=output
     )
@@ -264,7 +397,7 @@ with gr.Blocks(
 # تشغيل التطبيق
 if __name__ == "__main__":
-    demo.queue(max_size=10)
     demo.launch(
         share=False,
         show_error=True,

+"""
+نسخة محسّنة من app.py مع دعم Quantization و Memory Optimization
+للنماذج الكبيرة على ZeroGPU
+Optimized version of app.py with Quantization and Memory Optimization
+for large models on ZeroGPU
+"""
 import gradio as gr
 import torch
 import spaces
 from PIL import Image
 import os
 import tempfile
+import gc
+from typing import Optional, Union
+# استيراد المكتبات الضرورية
 try:
     from uni_moe.model.processing_qwen2_vl import Qwen2VLProcessor
     from uni_moe.model.modeling_out import GrinQwen2VLOutForConditionalGeneration
     from uni_moe.qwen_vl_utils import process_mm_info
+    from transformers import BitsAndBytesConfig
+except ImportError as e:
+    print(f"⚠️ Warning: Import error - {e}")
+    print("Some features may not work properly.")
# ==================== Configuration ====================
# Checkpoint to load from the Hugging Face Hub.
MODEL_NAME = "HIT-TMG/Uni-MoE-2.0-Omni"  # full model
# MODEL_NAME = "HIT-TMG/Uni-MoE-2.0-Base"  # smaller alternative

# Optimization settings
USE_4BIT = True  # load the weights with 4-bit quantization to save memory
USE_8BIT = False  # alternative: 8-bit quantization; only consulted when USE_4BIT is False
# NOTE(review): this flag is not read anywhere in the visible code — confirm
# whether Flash Attention is actually being enabled somewhere.
USE_FLASH_ATTENTION = True
MAX_MEMORY = "20GB"  # memory cap for device 0, applied to unquantized CUDA loads

device = "cuda" if torch.cuda.is_available() else "cpu"

# ==================== Model Loading ====================
# Startup banner so the Space logs show which configuration was used.
print("=" * 60)
print("🚀 Loading Uni-MoE 2.0 Model")
print(f"📍 Model: {MODEL_NAME}")
print(f"🖥️ Device: {device}")
print(f"⚙️ 4-bit Quantization: {USE_4BIT}")
print(f"⚙️ 8-bit Quantization: {USE_8BIT}")
print("=" * 60)
def load_model_optimized():
    """Load the processor and model named by MODEL_NAME into module globals.

    Quantization follows the module flags, with USE_4BIT taking precedence
    over USE_8BIT. Quantization is only requested when ``device`` is
    "cuda"; on CPU the flags are ignored (with a warning) and the weights
    are loaded in float32 instead of float16.

    On success the globals ``processor`` and ``model`` are assigned and
    ``processor.data_args`` is set to ``model.config``. The globals are
    only assigned once both objects have loaded, so a failed model load
    never leaves a processor published without its model.

    Returns:
        bool: True when both objects were loaded, False when any step
        raised (the error message and traceback are printed).
    """
    global processor, model
    try:
        # Load the processor
        print("📥 Loading processor...")
        loaded_processor = Qwen2VLProcessor.from_pretrained(MODEL_NAME)

        # Build the quantization config (GPU only)
        on_gpu = device == "cuda"
        quantization_config = None
        if on_gpu and USE_4BIT:
            print("⚙️ Setting up 4-bit quantization...")
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
            )
        elif on_gpu and USE_8BIT:
            print("⚙️ Setting up 8-bit quantization...")
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
            )
        elif USE_4BIT or USE_8BIT:
            print("⚠️ Quantization requested but no CUDA device found; loading without quantization.")

        # Pick the weight dtype: float32 on CPU, nothing explicit for 4-bit
        # loads (as before), float16 otherwise.
        if not on_gpu:
            torch_dtype = torch.float32
        elif USE_4BIT:
            torch_dtype = None
        else:
            torch_dtype = torch.float16

        # Load the model
        print("📥 Loading model (this may take a few minutes)...")
        load_kwargs = {
            "device_map": "auto",
            "torch_dtype": torch_dtype,
            "trust_remote_code": True,
        }
        if quantization_config is not None:
            load_kwargs["quantization_config"] = quantization_config
        # The memory cap is only applied to unquantized CUDA loads.
        if on_gpu and quantization_config is None:
            load_kwargs["max_memory"] = {0: MAX_MEMORY}

        loaded_model = GrinQwen2VLOutForConditionalGeneration.from_pretrained(
            MODEL_NAME,
            **load_kwargs
        )

        # Attach the model config to the processor as data_args
        loaded_processor.data_args = loaded_model.config

        # Publish both objects together, only after everything succeeded.
        processor, model = loaded_processor, loaded_model

        print("✅ Model loaded successfully!")
        print(f"💾 Model size: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B parameters")
        return True
    except Exception as e:
        # Deliberately broad: this is the start-up boundary and the app must
        # still come up (with the model unavailable) whatever went wrong.
        import traceback

        print(f"❌ Error loading model: {str(e)}")
        # The UI tells users to check the logs, so keep the full traceback.
        traceback.print_exc()
        return False
# Load the model once at import time; when loading fails, define both
# globals as None so request handlers can detect the missing model.
model_loaded = load_model_optimized()
if not model_loaded:
    processor = model = None
+# ==================== دوال مساعدة / Helper Functions ====================
def clear_gpu_memory():
    """Free unreachable Python objects, then release cached CUDA memory.

    Garbage collection runs first so that tensors kept alive only by
    reference cycles are destroyed before ``torch.cuda.empty_cache()`` is
    asked to hand unused cached blocks back to the driver. Emptying the
    cache first would leave those blocks cached until the next call.

    The garbage collection runs on CPU-only hosts as well; the CUDA cache
    is only touched when CUDA is available.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
def estimate_tokens(text: str) -> int:
    """Return a rough token-count estimate for *text*.

    The estimate is 1.3 tokens per whitespace-separated word, rounded up
    so the result is an ``int`` as the signature promises (the raw product
    is a float). Empty or missing text yields 0.

    Args:
        text: The text to measure.

    Returns:
        The estimated number of tokens.
    """
    word_count = len(text.split()) if text else 0
    # Integer form of ceil(word_count * 1.3); avoids float rounding noise.
    return (word_count * 13 + 9) // 10
+# ==================== دالة التوليد الرئيسية / Main Generation Function ====================
+@spaces.GPU(duration=120)
 def generate_response(
     text_input: str,
+    image_input: Optional[Image.Image] = None,
+    audio_input: Optional[str] = None,
     temperature: float = 1.0,
+    max_new_tokens: int = 512,
+    top_p: float = 0.9,
+    repetition_penalty: float = 1.1
+) -> str:
     """
+    توليد استجابة من النموذج
+    Generate response from the model
     """
+    # التحقق من توفر النموذج
     if model is None or processor is None:
+        return "❌ النموذج غير متاح. يرجى التحقق من السجلات.\n❌ Model not available. Please check logs."
+    # تنظيف الذاكرة قبل البدء
+    clear_gpu_memory()
     try:
+        # التحقق من المدخلات
+        if not text_input and image_input is None and audio_input is None:
+            return "⚠️ يرجى إدخال نص أو صورة أو صوت على الأقل.\n⚠️ Please provide at least text, image, or audio input."
+        # بناء محتوى الرسالة
         content = []
         # إضافة النص
             content.append({"type": "text", "text": text_input})
         # إضافة الصورة
+        temp_image_path = None
         if image_input is not None:
+            temp_image_path = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg").name
+            image_input.save(temp_image_path)
+            content.append({"type": "image", "image": temp_image_path})
         # إضافة الصوت
         if audio_input is not None:
             content.append({"type": "audio", "audio": audio_input})
         # بناء الرسائل
+        messages = [{"role": "user", "content": content}]
+        # معالجة النص
         texts = processor.apply_chat_template(
+            messages,
+            tokenize=False,
             add_generation_prompt=True
         )
         inputs = inputs.to(device=model.device)
         # التوليد
+        with torch.inference_mode():
             output_ids = model.generate(
                 **inputs,
                 use_cache=True,
                 pad_token_id=processor.tokenizer.eos_token_id,
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
+                do_sample=True,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty
             )
+        # فك التشفير
         response = processor.batch_decode(
+            output_ids[:, inputs["input_ids"].shape[-1]:],
             skip_special_tokens=True
         )[0]
+        # تنظيف الملفات المؤقتة
+        if temp_image_path and os.path.exists(temp_image_path):
+            os.unlink(temp_image_path)
+        # تنظيف الذاكرة
+        clear_gpu_memory()
         return response
     except Exception as e:
+        clear_gpu_memory()
+        error_msg = f"❌ خطأ / Error: {str(e)}"
+        print(error_msg)
+        return error_msg
+# ==================== واجهة Gradio / Gradio Interface ====================
+css = """
+.rtl { direction: rtl; text-align: right; }
+.main-header {
+    text-align: center;
+    margin-bottom: 2rem;
+    padding: 2rem;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    border-radius: 10px;
+    color: white;
+}
+.note-box {
+    padding: 1rem;
+    background: #f0f9ff;
+    border-left: 4px solid #3b82f6;
+    border-radius: 4px;
+    margin: 1rem 0;
+}
+"""
+with gr.Blocks(title="Uni-MoE 2.0 Omni - Optimized", theme=gr.themes.Soft(), css=css) as demo:
+    gr.HTML("""
     <div class="main-header">
+        <h1>🚀 Uni-MoE 2.0 Omni Demo</h1>
+        <p style="font-size: 1.1em; margin-top: 1rem;">
+            نموذج متعدد الوسائط متقدم - Advanced Omnimodal Model
+        </p>
+        <p style="font-size: 0.9em; opacity: 0.9; margin-top: 0.5rem;">
+            يدعم فهم وتوليد النصوص والصور والصوت<br>
+            Supports understanding and generation of text, images, and audio
+        </p>
     </div>
     """)
             text_input = gr.Textbox(
                 label="النص / Text",
+                placeholder="اكتب سؤالك أو وصفك هنا...\nEnter your question or description here...",
+                lines=4,
                 rtl=True
             )
+            with gr.Row():
+                image_input = gr.Image(
+                    label="الصورة (اختياري) / Image (Optional)",
+                    type="pil",
+                    height=300
+                )
             audio_input = gr.Audio(
                 label="الصوت (اختياري) / Audio (Optional)",
             with gr.Accordion("⚙️ إعدادات متقدمة / Advanced Settings", open=False):
                 temperature = gr.Slider(
+                    minimum=0.1, maximum=2.0, value=0.7, step=0.1,
+                    label="Temperature (الإبداعية / Creativity)"
                 )
                 max_tokens = gr.Slider(
+                    minimum=64, maximum=2048, value=512, step=64,
+                    label="Max Tokens (الطول الأقصى / Max Length)"
+                )
+                top_p = gr.Slider(
+                    minimum=0.1, maximum=1.0, value=0.9, step=0.05,
+                    label="Top P (التنوع / Diversity)"
+                )
+                repetition_penalty = gr.Slider(
+                    minimum=1.0, maximum=2.0, value=1.1, step=0.1,
+                    label="Repetition Penalty (تجنب التكرار / Avoid Repetition)"
                 )
+            with gr.Row():
+                submit_btn = gr.Button("🎯 توليد / Generate", variant="primary", size="lg")
+                clear_btn = gr.Button("🗑️ مسح / Clear", size="lg")
         with gr.Column(scale=1):
             gr.Markdown("### 💬 النتيجة / Output")
             output = gr.Textbox(
                 label="الاستجابة / Response",
+                lines=20,
                 show_copy_button=True,
                 rtl=True
             )
+    # ملاحظات مهمة
+    gr.HTML("""
+    <div class="note-box">
+        <h3>📌 ملاحظات مهمة / Important Notes</h3>
+        <ul>
+            <li>⏱️ قد يستغرق التوليد 30-60 ثانية / Generation may take 30-60 seconds</li>
+            <li>💾 يستخدم النموذج quantization لتوفير الذاكرة / Model uses quantization to save memory</li>
+            <li>🔄 يتم تنظيف الذاكرة تلقائياً بعد كل استخدام / Memory is cleared automatically after each use</li>
+        </ul>
+    </div>
+    """)
     # أمثلة
     gr.Markdown("### 📚 أمثلة / Examples")
     gr.Examples(
         examples=[
+            ["ما هي عاصمة مصر؟ What is the capital of Egypt?", None, None],
+            ["صف هذه الصورة بالتفصيل\nDescribe this image in detail", "https://picsum.photos/400/300", None],
+            ["قارن بين Python و JavaScript\nCompare Python and JavaScript", None, None],
         ],
         inputs=[text_input, image_input, audio_input],
     )
     # معلومات إضافية
     gr.Markdown("""
     ---
+    ### ℹ️ حول النموذج / About the Model
+    **Uni-MoE 2.0 Omni** بني على:
+    - 🧠 Mixture-of-Experts (MoE) architecture
+    - 📊 Qwen2.5-7B base model (~33B parameters with experts)
+    - 🌐 Omni-Modality 3D RoPE for cross-modal alignment
+    - ⚡ Dynamic-Capacity routing mechanism
+    **الأداء / Performance:**
+    - ✅ +7% على فهم الفيديو / video understanding
+    - ✅ +4% على الاستدلال السمعي-البصري / audio-visual reasoning
+    - ✅ متفوق على Qwen2.5-Omni في 50+ معياراً / benchmarks
+    📄 [ورقة بحثية / Paper](https://arxiv.org/abs/2511.12609) |
+    💻 [GitHub](https://github.com/HITsz-TMG/Uni-MoE) |
+    🤗 [Model](https://huggingface.co/HIT-TMG/Uni-MoE-2.0-Omni)
     """)
     # ربط الأحداث
     submit_btn.click(
         fn=generate_response,
+        inputs=[text_input, image_input, audio_input, temperature, max_tokens, top_p, repetition_penalty],
         outputs=output
     )
 # تشغيل التطبيق
 if __name__ == "__main__":
+    demo.queue(max_size=20, default_concurrency_limit=5)
     demo.launch(
         share=False,
         show_error=True,