Derr11 committed on
Commit 210a758 · verified · 1 Parent(s): ce07ef2

Update app.py

Files changed (1):
  1. app.py +227 -346

app.py CHANGED
@@ -1,392 +1,273 @@
- import os
- import torch
  import gradio as gr
  import spaces
  from PIL import Image
- from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
- import warnings
- warnings.filterwarnings("ignore")
-
- # =========================================================
- # Model configuration
- # =========================================================

- MODEL_ID = "openbmb/MiniCPM-o-2_6"

- # Lazy-load the model
- model = None
- tokenizer = None


- def load_model():
-     """Load the model only when it is needed"""
-     global model, tokenizer
-
-     if model is not None:
-         return

-     print(f"Loading {MODEL_ID}...")
-
-     # Use float16 for compatibility with ZeroGPU
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-     dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-
-     try:
-         # Load the tokenizer first
-         tokenizer = AutoTokenizer.from_pretrained(
-             MODEL_ID,
-             trust_remote_code=True,
-             use_fast=False
-         )
-
-         # Load the model with trust_remote_code=True
-         model = AutoModel.from_pretrained(
-             MODEL_ID,
-             trust_remote_code=True,
-             torch_dtype=dtype,
-             low_cpu_mem_usage=True,
-             attn_implementation="eager",
-         ).eval()
-
-         if torch.cuda.is_available():
-             model = model.cuda()
-
-         print("Model loaded successfully!")
-
-     except Exception as e:
-         print(f"Error with AutoModel, trying AutoModelForCausalLM: {e}")
-
-         # Fallback attempt with AutoModelForCausalLM
-         try:
-             model = AutoModelForCausalLM.from_pretrained(
-                 MODEL_ID,
-                 trust_remote_code=True,  # very important!
-                 torch_dtype=dtype,
-                 low_cpu_mem_usage=True,
-                 attn_implementation="eager"
-             ).eval()
-
-             if torch.cuda.is_available():
-                 model = model.cuda()
-
-             print("Model loaded successfully with AutoModelForCausalLM!")
-
-         except Exception as e2:
-             print(f"Failed to load model: {e2}")
-             raise RuntimeError(f"Could not load model: {e2}")
-

- # =========================================================
- # Image-processing helper
- # =========================================================
-
- def process_image(image_input):
-     """Prepare the image for the model"""
-     if image_input is None:
-         return None
-
-     if isinstance(image_input, str):
-         return Image.open(image_input).convert('RGB')
-     else:
-         return image_input.convert('RGB')

-
- # =========================================================
- # Inference function with ZeroGPU
- # =========================================================
-
- @spaces.GPU(duration=60)
  def generate_response(
-     text_input,
-     image_input,
-     temperature,
-     top_p,
-     max_new_tokens
  ):
      """
-     Process text and images with MiniCPM-o-2_6
      """
-
-     if not text_input and not image_input:
-         return "Please provide text or image input."

      try:
-         load_model()
-         global model, tokenizer

-         # Prepare the inputs
          if image_input is not None:
-             # Image + text
-             image = process_image(image_input)
-
-             if not text_input:
-                 text_input = "What is shown in this image? Please describe in detail."
-
-             # Check whether the model exposes a chat method
-             if hasattr(model, 'chat'):
-                 try:
-                     # Use the model's dedicated chat method
-                     msgs = [{"role": "user", "content": [image, text_input]}]
-
-                     with torch.no_grad():
-                         response = model.chat(
-                             image=image,
-                             msgs=msgs,
-                             tokenizer=tokenizer,
-                             sampling=True,
-                             temperature=temperature,
-                             top_p=top_p,
-                             max_new_tokens=max_new_tokens
-                         )
-
-                     return response
-
-                 except Exception as e:
-                     print(f"Chat method failed: {e}")
-                     # Fall through to the plain path
-
-             # Fallback path for images:
-             # combine the text with an image placeholder
-             prompt = f"Image: [Image will be processed]\n\nQuestion: {text_input}\n\nAnswer:"
-
-         else:
-             # Text only
-             prompt = text_input

-         # Plain text processing
-         inputs = tokenizer(
-             prompt,
-             return_tensors="pt",
-             padding=True,
-             truncation=True,
-             max_length=2048
          )

-         if torch.cuda.is_available():
-             inputs = {k: v.cuda() for k, v in inputs.items() if v is not None}

-         # Generation settings
-         gen_kwargs = {
-             "max_new_tokens": max_new_tokens,
-             "temperature": temperature if temperature > 0 else 1e-7,
-             "top_p": top_p,
-             "do_sample": temperature > 0,
-             "pad_token_id": tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
-             "eos_token_id": tokenizer.eos_token_id,
-         }

          # Generation
          with torch.no_grad():
-             outputs = model.generate(**inputs, **gen_kwargs)

-         # Decode
-         response = tokenizer.decode(
-             outputs[0][inputs['input_ids'].shape[1]:],
              skip_special_tokens=True
-         )

-         return response.strip()

      except Exception as e:
-         import traceback
-         traceback.print_exc()
-         return f"Error: {str(e)}"

- # =========================================================
- # UI helper functions
- # =========================================================
-
- def clear_all():
-     """Clear all inputs and outputs"""
-     return "", None, ""
-
-
- def update_examples_visibility(show_examples):
-     """Toggle the examples' visibility"""
-     return gr.update(visible=show_examples)
-
-
- # =========================================================
- # Gradio interface
- # =========================================================
-
- def create_demo():
-     """Build the simple Gradio interface"""

-     with gr.Blocks(title="MiniCPM-o-2.6", css="""
-         .gradio-container {
-             max-width: 1200px;
-             margin: auto;
-         }
-         h1 {
-             text-align: center;
-         }
-         .contain {
-             background: white;
-             border-radius: 10px;
-             padding: 20px;
-         }
-     """) as demo:
-
-         gr.Markdown(
-             """
-             # 🤖 MiniCPM-o-2.6 - Multimodal AI Assistant

-             <div style="text-align: center;">
-             <p>
-             <b>8B parameters model</b> with GPT-4 level performance<br>
-             Supports: Text Generation, Image Understanding, OCR, and Multi-lingual conversations
-             </p>
-             </div>
-             """
-         )
-
-         with gr.Row():
-             # Main column
-             with gr.Column(scale=2):
-                 with gr.Group():
-                     text_input = gr.Textbox(
-                         label="💭 Text Input",
-                         placeholder="Enter your question or prompt here...\nYou can ask about images, request text generation, or have a conversation.",
-                         lines=4,
-                         elem_id="text_input"
-                     )
-
-                     image_input = gr.Image(
-                         label="📷 Image Input (Optional)",
-                         type="pil",
-                         elem_id="image_input"
-                     )
-
-                     with gr.Row():
-                         submit_btn = gr.Button(
-                             "🚀 Generate Response",
-                             variant="primary",
-                             scale=2
-                         )
-                         clear_btn = gr.Button(
-                             "🗑️ Clear All",
-                             variant="secondary",
-                             scale=1
-                         )
-
-                 output = gr.Textbox(
-                     label="🤖 AI Response",
-                     lines=10,
-                     interactive=False,
-                     elem_id="output"
-                 )

-             # Settings column
-             with gr.Column(scale=1):
-                 with gr.Group():
-                     gr.Markdown("### ⚙️ Generation Settings")
-
-                     temperature = gr.Slider(
-                         label="Temperature",
-                         minimum=0.0,
-                         maximum=1.5,
-                         value=0.7,
-                         step=0.1,
-                         info="Controls randomness (0=deterministic, 1.5=very creative)"
-                     )
-
-                     top_p = gr.Slider(
-                         label="Top-p (Nucleus Sampling)",
-                         minimum=0.1,
-                         maximum=1.0,
-                         value=0.9,
-                         step=0.05,
-                         info="Controls diversity of output"
-                     )
-
-                     max_new_tokens = gr.Slider(
-                         label="Max New Tokens",
-                         minimum=50,
-                         maximum=2048,
-                         value=512,
-                         step=50,
-                         info="Maximum length of generated response"
-                     )

-                 gr.Markdown(
-                     """
-                     ### 📚 Quick Tips:
-
-                     **Text Generation:**
-                     - Ask questions
-                     - Request explanations
-                     - Generate creative content
-
-                     **Image Understanding:**
-                     - Upload an image
-                     - Ask about contents
-                     - Request OCR/text extraction
-                     - Get detailed descriptions
-
-                     **Languages:**
-                     - English, Chinese, Arabic
-                     - And many more!
-                     """
                  )

-         # Examples
-         with gr.Group():
-             gr.Markdown("### 💡 Example Prompts")
-             gr.Examples(
-                 examples=[
-                     ["Explain quantum computing in simple terms for a beginner.", None],
-                     ["Write a short story about a robot learning to paint.", None],
-                     ["What are the main differences between Python and JavaScript?", None],
-                     ["Create a healthy meal plan for one week.", None],
-                     ["Translate 'Hello, how are you?' to French, Spanish, and Arabic.", None],
-                 ],
-                 inputs=[text_input, image_input],
-                 outputs=output,
-                 fn=lambda t, i: generate_response(t, i, 0.7, 0.9, 512),
-                 cache_examples=False,
-                 label="Click any example to try it"
              )
-
-         # Wire up the events
-         submit_btn.click(
-             fn=generate_response,
-             inputs=[text_input, image_input, temperature, top_p, max_new_tokens],
-             outputs=output,
-             api_name="generate"
-         )
-
-         text_input.submit(
-             fn=generate_response,
-             inputs=[text_input, image_input, temperature, top_p, max_new_tokens],
-             outputs=output
-         )
-
-         clear_btn.click(
-             fn=clear_all,
-             inputs=[],
-             outputs=[text_input, image_input, output]
-         )
-
-         # Welcome message on load
-         demo.load(
-             lambda: gr.Info("Model is loading... This may take a moment on first use."),
-             inputs=None,
-             outputs=None
-         )

-         return demo


- # =========================================================
  # Run the app
- # =========================================================
-
  if __name__ == "__main__":
-     demo = create_demo()
      demo.launch(
-         ssr_mode=False,
          show_error=True,
-         share=False
-     )
  import gradio as gr
+ import torch
  import spaces
  from PIL import Image
+ import numpy as np
+ import os
+ import tempfile
+
+ # Import the required libraries from Uni-MoE
+ try:
+     from uni_moe.model.processing_qwen2_vl import Qwen2VLProcessor
+     from uni_moe.model.modeling_out import GrinQwen2VLOutForConditionalGeneration
+     from uni_moe.qwen_vl_utils import process_mm_info
+     from uni_moe.model import deepspeed_moe_inference_utils
+ except ImportError:
+     print("⚠️ Warning: Uni-MoE libraries not fully imported. Some features may not work.")
+
+ # Load the model
+ MODEL_NAME = "HIT-TMG/Uni-MoE-2.0-Omni"
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ print(f"🚀 Loading model: {MODEL_NAME}")
+ print(f"📍 Device: {device}")
+
+ # Load the processor and the model
+ try:
+     processor = Qwen2VLProcessor.from_pretrained(MODEL_NAME)
+     model = GrinQwen2VLOutForConditionalGeneration.from_pretrained(
+         MODEL_NAME,
+         torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
+         device_map="auto"
+     )
+     if device == "cuda":
+         model = model.cuda()
+
+     # Set data_args on the processor
+     processor.data_args = model.config
+     print("✅ Model loaded successfully!")
+ except Exception as e:
+     print(f"❌ Error loading model: {str(e)}")
+     processor = None
+     model = None
+
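+ # If loading failed above, processor and model remain None; generate_response()
+ # checks for that and returns an error message, so the UI below can still launch.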
+
+ @spaces.GPU(duration=120)  # use ZeroGPU for up to 120 seconds
  def generate_response(
+     text_input: str,
+     image_input: Image.Image = None,
+     audio_input: str = None,
+     temperature: float = 1.0,
+     max_new_tokens: int = 512
  ):
      """
+     Generate a response from the model based on the given inputs
      """
+     if model is None or processor is None:
+         return "❌ The model is currently unavailable. Please try again later."

      try:
+         # Build the user message
+         content = []
+
+         # Add the text
+         if text_input:
+             content.append({"type": "text", "text": text_input})
+
+         # Add the image
          if image_input is not None:
+             # Save the image to a temporary file
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp_img:
+                 image_input.save(tmp_img.name)
+                 content.append({"type": "image", "image": tmp_img.name})
+
+         # Add the audio
+         if audio_input is not None:
+             content.append({"type": "audio", "audio": audio_input})
+
+         if not content:
+             return "⚠️ Please provide text, an image, or audio."
+
+         # Build the messages
+         messages = [{
+             "role": "user",
+             "content": content
+         }]
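+         # For a text+image request, messages ends up shaped like
+         # (illustrative values, not produced by the code above):
+         #   [{"role": "user", "content": [
+         #       {"type": "text", "text": "Describe this image"},
+         #       {"type": "image", "image": "/tmp/tmpXXXX.jpg"}]}]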
+
+         # Apply the chat template to the messages
+         texts = processor.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True
          )

+         # Replace the special placeholder tags with the model's tokens
+         texts = texts.replace(
+             "<image>", "<|vision_start|><|image_pad|><|vision_end|>"
+         ).replace(
+             "<audio>", "<|audio_start|><|audio_pad|><|audio_end|>"
+         ).replace(
+             "<video>", "<|vision_start|><|video_pad|><|vision_end|>"
+         )
+
103
+ # معالجة الوسائط
104
+ image_inputs, video_inputs, audio_inputs = process_mm_info(messages)
105
+
106
+ # تجهيز المدخلات
107
+ inputs = processor(
108
+ text=texts,
109
+ images=image_inputs,
110
+ videos=video_inputs,
111
+ audios=audio_inputs,
112
+ padding=True,
113
+ return_tensors="pt",
114
+ )
115
 
116
+ inputs["input_ids"] = inputs["input_ids"].unsqueeze(0)
117
+ inputs = inputs.to(device=model.device)
 
 
 
 
 
 
 
118
 
119
  # التوليد
120
  with torch.no_grad():
121
+ output_ids = model.generate(
122
+ **inputs,
123
+ use_cache=True,
124
+ pad_token_id=processor.tokenizer.eos_token_id,
125
+ max_new_tokens=max_new_tokens,
126
+ temperature=temperature,
127
+ do_sample=True
128
+ )
129
 
130
+ # فك تشفير النتي��ة
131
+ response = processor.batch_decode(
132
+ output_ids[:, inputs["input_ids"].shape[-1]:],
133
  skip_special_tokens=True
134
+ )[0]
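+         # Slicing past inputs["input_ids"].shape[-1] drops the echoed prompt
+         # tokens, so only the newly generated completion is decoded.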
+
+         return response

      except Exception as e:
+         return f"❌ An error occurred: {str(e)}"


+ # Build the Gradio interface
+ with gr.Blocks(
+     title="Uni-MoE 2.0 Omni Demo",
+     theme=gr.themes.Soft(),
+     css="""
+     .rtl { direction: rtl; text-align: right; }
+     .main-header { text-align: center; margin-bottom: 2rem; }
+     """
+ ) as demo:
+
+     gr.Markdown("""
+     <div class="main-header">
+
+     # 🚀 Uni-MoE 2.0 Omni Demo
+
+     An advanced omnimodal model supporting understanding and generation of **text, images, and audio**
+
+     </div>
+     """)
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             gr.Markdown("### 📝 Inputs")

+             text_input = gr.Textbox(
+                 label="Text",
+                 placeholder="Enter your question or description here...",
+                 lines=3,
+                 rtl=True
+             )

+             image_input = gr.Image(
+                 label="Image (Optional)",
+                 type="pil"
+             )
+
+             audio_input = gr.Audio(
+                 label="Audio (Optional)",
+                 type="filepath"
+             )
+
+             with gr.Accordion("⚙️ Advanced Settings", open=False):
+                 temperature = gr.Slider(
+                     minimum=0.1,
+                     maximum=2.0,
+                     value=1.0,
+                     step=0.1,
+                     label="Temperature"
+                 )

+                 max_tokens = gr.Slider(
+                     minimum=64,
+                     maximum=2048,
+                     value=512,
+                     step=64,
+                     label="Max New Tokens"
                  )
+
+             submit_btn = gr.Button("🎯 Generate", variant="primary")
+             clear_btn = gr.Button("🗑️ Clear")

+         with gr.Column(scale=1):
+             gr.Markdown("### 💬 Output")
+
+             output = gr.Textbox(
+                 label="Response",
+                 lines=15,
+                 show_copy_button=True,
+                 rtl=True
              )

+     # Examples
+     gr.Markdown("### 📚 Examples")
+     gr.Examples(
+         examples=[
+             ["What is the capital of Egypt?", None, None],
+             ["Describe this image in detail", "https://picsum.photos/400/300", None],
+             ["What is the capital of France?", None, None],
+             ["Describe this image in detail", "https://picsum.photos/400/300", None],
+         ],
+         inputs=[text_input, image_input, audio_input],
+     )
+
+     # Additional information
+     gr.Markdown("""
+     ---
+     ### ℹ️ Information
+
+     **Uni-MoE 2.0 Omni** is an omnimodal language model built on:
+     - 🧠 **Mixture-of-Experts (MoE)** for compute efficiency
+     - 🔄 **Qwen2.5-7B** as the base model
+     - 🎯 **Omni-Modality 3D RoPE** for multimodal alignment
+
+     **Capabilities:**
+     - ✅ Understanding of text, images, audio, and video
+     - ✅ Generation of text, images, and audio
+     - ✅ Multimodal reasoning
+
+     📄 **Paper:** [arXiv:2511.12609](https://arxiv.org/abs/2511.12609)
+
+     🔗 **GitHub:** [HITsz-TMG/Uni-MoE](https://github.com/HITsz-TMG/Uni-MoE)
+
+     ---
+     <p style="text-align: center; color: #666;">
+     Built with Gradio and ZeroGPU 🚀
+     </p>
+     """)
+
+     # Wire up the events
+     submit_btn.click(
+         fn=generate_response,
+         inputs=[text_input, image_input, audio_input, temperature, max_tokens],
+         outputs=output
+     )
+
+     clear_btn.click(
+         fn=lambda: (None, None, None, None),
+         outputs=[text_input, image_input, audio_input, output]
+     )


  # Run the app
  if __name__ == "__main__":
+     demo.queue(max_size=10)
      demo.launch(
+         share=False,
          show_error=True,
+         server_name="0.0.0.0",
+         server_port=7860
+     )