Spaces:

Derr11
/

Der11

Paused

App Files Files Community

Derr11 committed 23 days ago

Commit

9c85028

verified ·

1 Parent(s): 7c953da

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -122

app.py CHANGED Viewed

@@ -7,25 +7,46 @@ import spaces
 from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
 from qwen_omni_utils import process_mm_info
-# ==========================
 # إعدادات عامة
-# ==========================
 MODEL_PATH = os.getenv("MODEL_PATH", "Qwen/Qwen3-Omni-30B-A3B-Instruct")
-USE_AUDIO_IN_VIDEO = True  # استخدام الصوت داخل الفيديو لو وجد
 VOICE_CHOICES = ["Ethan", "Chelsie", "Aiden"]
 DEFAULT_VOICE = "Ethan"
-# تحميل كسول
 model = None
 processor = None
 def load_model():
     """
-    تحميل Qwen3-Omni والمعالج عند أول استدعاء فقط (على ZeroGPU).
-    تم إلغاء flash_attention_2 و device_map='auto' لتجنب مشاكل الاستدعاء.
     """
     global model, processor
@@ -34,7 +55,7 @@ def load_model():
     print(f"[ZeroGPU] Loading model from: {MODEL_PATH}")
-    # نحدد نوع البيانات والجهاز
     if torch.cuda.is_available():
         torch_dtype = torch.bfloat16
         device = "cuda"
@@ -42,13 +63,12 @@ def load_model():
         torch_dtype = torch.float32
         device = "cpu"
-    # تحميل النموذج بدون flash_attention_2 ولا device_map="auto"
     local_model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
         MODEL_PATH,
         torch_dtype=torch_dtype,
-        attn_implementation="eager",  # الأكثر أماناً في هذه البيئة
     )
     local_model.to(device)
     local_processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)
@@ -58,33 +78,47 @@ def load_model():
     print(f"[ZeroGPU] Model loaded on {device} with dtype {torch_dtype}.")
-def build_messages_from_history(history, system_prompt, user_text, image, audio_path, video_path):
     """
-    تحويل تاريخ الدردشة + المدخل الحالي إلى تنسيق الرسائل المطلوب من Qwen3-Omni.
     history: list of [user_text, assistant_text]
     """
     messages = []
     if system_prompt:
-        messages.append({
-            "role": "system",
-            "content": [{"type": "text", "text": system_prompt}],
-        })
     # تاريخ المحادثة
     for user_msg, assistant_msg in history:
         if user_msg:
-            messages.append({
-                "role": "user",
-                "content": [{"type": "text", "text": user_msg}],
-            })
         if assistant_msg:
-            messages.append({
-                "role": "assistant",
-                "content": [{"type": "text", "text": assistant_msg}],
-            })
-    # الرسالة الحالية (وسائط + نص)
     user_content = []
     if image is not None:
@@ -100,17 +134,19 @@ def build_messages_from_history(history, system_prompt, user_text, image, audio_
         user_content.append({"type": "text", "text": user_text.strip()})
     if user_content:
-        messages.append({
-            "role": "user",
-            "content": user_content,
-        })
     return messages
-# ==========================
-# دالة الاستدلال (ZeroGPU)
-# ==========================
 @spaces.GPU(duration=120)
 def qwen3_omni_inference(
@@ -127,13 +163,13 @@ def qwen3_omni_inference(
     max_tokens,
 ):
     """
-    تنفيذ الاستدلال الفعلي على ZeroGPU:
-    - نص + صورة + صوت + فيديو
-    - مخرج نصي دائماً، وصوتي عند الحاجة
     """
     if not (user_text or image is not None or audio_path or video_path):
-        # لا يوجد مدخل من المستخدم
         return history, None, "", None, None, None
     load_model()
@@ -148,14 +184,14 @@ def qwen3_omni_inference(
         video_path=video_path,
     )
-    # بناء النص من المحادثة (chat template)
     text_prompt = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=False,
     )
-    # تجهيز الوسائط المتعددة (صوت/صورة/فيديو)
     audios, images, videos = process_mm_info(
         messages,
         use_audio_in_video=USE_AUDIO_IN_VIDEO,
@@ -172,13 +208,13 @@ def qwen3_omni_inference(
         use_audio_in_video=USE_AUDIO_IN_VIDEO,
     )
-    # نقل المدخلات إلى نفس الجهاز ونفس dtype للنموذج
     first_param = next(model.parameters())
     device = first_param.device
     dtype = first_param.dtype
     inputs = inputs.to(device=device, dtype=dtype)
-    # بارامترات التوليد
     gen_kwargs = dict(
         temperature=float(temperature),
         top_p=float(top_p),
@@ -187,16 +223,16 @@ def qwen3_omni_inference(
         use_audio_in_video=USE_AUDIO_IN_VIDEO,
     )
-    # توليد النص فقط أو نص + صوت
     if not return_audio:
         gen_kwargs["return_audio"] = False
-        text_ids, _ = model.generate(**inputs, **gen_kwargs)
         audio_out = None
     else:
         gen_kwargs["speaker"] = speaker
         text_ids, audio_out = model.generate(**inputs, **gen_kwargs)
-    # فك ترميز النص الناتج
     input_len = inputs["input_ids"].shape[1]
     generated_text = processor.batch_decode(
         text_ids.sequences[:, input_len:],
@@ -205,10 +241,12 @@ def qwen3_omni_inference(
     )[0]
     # تحديث تاريخ الدردشة
-    user_display = user_text if (user_text and user_text.strip()) else "[Multimodal message]"
     history = history + [[user_display, generated_text]]
-    # تجهيز الصوت لمخرج Gradio (إن وجد)
     gr_audio = None
     if audio_out is not None:
         audio_np = audio_out.reshape(-1).detach().cpu().numpy()
@@ -219,12 +257,12 @@ def qwen3_omni_inference(
     return history, gr_audio, "", None, None, None
-# ==========================
-# دوال واجهة
-# ==========================
 def clear_chat():
-    """مسح المحادثة والصوت."""
     return [], None
@@ -236,15 +274,15 @@ def create_interface():
             """
             <h1 style="text-align:center;">Qwen3-Omni-30B-A3B – ZeroGPU Chat</h1>
             <p style="text-align:center;">
-            دردشة متعددة الوسائط (نص + صورة + صوت + فيديو) تعمل على ZeroGPU.
-            استخدم حقل الرسالة مع المرفقات بالأسفل ثم اضغط <b>إرسال</b> أو Enter.
             (لإضافة سطر جديد استخدم Shift+Enter)
             </p>
             """
         )
         with gr.Row():
-            # عمود المحادثة
             with gr.Column(scale=3):
                 chatbot = gr.Chatbot(
                     label="المحادثة",
@@ -257,7 +295,6 @@ def create_interface():
                     autoplay=True,
                 )
-                # منطقة الإدخال أسفل الشات
                 with gr.Row():
                     user_text = gr.Textbox(
                         label="رسالتك",
@@ -286,7 +323,7 @@ def create_interface():
                     send_btn = gr.Button("إرسال", variant="primary", scale=2)
                     clear_btn = gr.Button("مسح المحادثة", variant="secondary")
-            # عمود الإعدادات
             with gr.Column(scale=1):
                 gr.Markdown("### إعدادات النموذج")
@@ -335,78 +372,78 @@ def create_interface():
                 gr.Markdown(
                     """
-                    **ملاحظات الاستخدام:**
-                    - يمكنك إرسال نص فقط، أو نص مع صورة/صوت/فيديو في نفس الرسالة.
-                    - اضغط Enter للإرسال، وShift+Enter لسطر جديد.
-                    - ZeroGPU قد يستغرق عدة ثوانٍ لكل رد حسب طول الرسالة.
                     """
                 )
-            # حالة المحادثة
-            history_state = gr.State([])
-            # دالة مشتركة للإرسال (زر + Enter)
-            send_inputs = [
-                history_state,
-                user_text,
-                image_input,
-                audio_input,
-                video_input,
-                system_prompt,
-                return_audio,
-                speaker,
-                temperature,
-                top_p,
-                max_tokens,
-            ]
-            send_outputs = [
-                history_state,
-                audio_output,
-                user_text,
-                image_input,
-                audio_input,
-                video_input,
-            ]
-            # زر إرسال
-            send_btn.click(
-                fn=qwen3_omni_inference,
-                inputs=send_inputs,
-                outputs=send_outputs,
-                queue=True,
-            ).then(
-                lambda h: h,
-                inputs=history_state,
-                outputs=chatbot,
-            )
-            # إرسال بالـ Enter من داخل الـ Textbox
-            user_text.submit(
-                fn=qwen3_omni_inference,
-                inputs=send_inputs,
-                outputs=send_outputs,
-                queue=True,
-            ).then(
-                lambda h: h,
-                inputs=history_state,
-                outputs=chatbot,
-            )
-            # مسح المحادثة
-            clear_btn.click(
-                fn=clear_chat,
-                inputs=None,
-                outputs=[history_state, audio_output],
-            ).then(
-                lambda: [],
-                inputs=None,
-                outputs=chatbot,
-            ).then(
-                lambda: ("", None, None),
-                inputs=None,
-                outputs=[user_text, image_input, audio_input],
-            )
     return demo
@@ -414,5 +451,5 @@ def create_interface():
 demo = create_interface()
 if __name__ == "__main__":
-    # نغلق SSR لأنه تجريبي ويسبب أحياناً مشاكل عرض
     demo.launch(ssr_mode=False)

 from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
 from qwen_omni_utils import process_mm_info
+# =========================================================
+# Patch to work around the lm_head problem in Qwen3OmniMoeTalker*
+# =========================================================
+def _patched_mark_tied_weights_as_initialized(self):
+    """
+    No-op replacement for ``mark_tied_weights_as_initialized``.
+
+    Per the original author, some transformers + Qwen3-Omni version
+    combinations raise the error:
+    Qwen3OmniMoeTalkerForConditionalGeneration has no attribute `lm_head`
+    when trying to tie the tied weights.
+    This patch turns that step into a no-op: it does nothing and
+    returns None.
+
+    NOTE(review): the author describes skipping this step as safe for
+    inference in most cases; that is not verified here -- confirm against
+    the installed transformers version.
+    """
+    return
+# Apply the patch before any call to from_pretrained.
+# The hasattr guard means the class is only modified when it actually
+# exposes mark_tied_weights_as_initialized; otherwise nothing is changed.
+if hasattr(Qwen3OmniMoeForConditionalGeneration, "mark_tied_weights_as_initialized"):
+    Qwen3OmniMoeForConditionalGeneration.mark_tied_weights_as_initialized = (
+        _patched_mark_tied_weights_as_initialized
+    )
+# =========================================================
 # General settings
+# =========================================================
 # Model identifier/path; can be overridden through the MODEL_PATH env var.
 MODEL_PATH = os.getenv("MODEL_PATH", "Qwen/Qwen3-Omni-30B-A3B-Instruct")
+USE_AUDIO_IN_VIDEO = True  # use the audio track inside a video, if present
 VOICE_CHOICES = ["Ethan", "Chelsie", "Aiden"]
 DEFAULT_VOICE = "Ethan"
+# The model is loaded lazily (on the first call only); until then both
+# module-level handles stay None.
 model = None
 processor = None
 def load_model():
     """
+    تحميل Qwen3-Omni والمعالج عند أول استدعاء فقط.
+    - نستخدم attn_implementation="eager" لتفادي الحاجة لـ flash-attn.
+    - لا نستخدم device_map="auto" لتفادي مشاكل توزيع الذاكرة على ZeroGPU.
     """
     global model, processor
     print(f"[ZeroGPU] Loading model from: {MODEL_PATH}")
+    # اختيار نوع البيانات والجهاز
     if torch.cuda.is_available():
         torch_dtype = torch.bfloat16
         device = "cuda"
         torch_dtype = torch.float32
         device = "cpu"
+    # تحميل النموذج (بدون flash_attention_2)
     local_model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
         MODEL_PATH,
         torch_dtype=torch_dtype,
+        attn_implementation="eager",  # آمن على ZeroGPU بدون flash-attn
     )
     local_model.to(device)
     local_processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)
     print(f"[ZeroGPU] Model loaded on {device} with dtype {torch_dtype}.")
+def build_messages_from_history(
+    history,
+    system_prompt,
+    user_text,
+    image,
+    audio_path,
+    video_path,
+):
     """
+    تحويل تاريخ الدردشة + المدخل الحالي إلى conversation بالـ format
+    المطلوب من Qwen3-Omni.
     history: list of [user_text, assistant_text]
     """
     messages = []
     if system_prompt:
+        messages.append(
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": system_prompt}],
+            }
+        )
     # تاريخ المحادثة
     for user_msg, assistant_msg in history:
         if user_msg:
+            messages.append(
+                {
+                    "role": "user",
+                    "content": [{"type": "text", "text": user_msg}],
+                }
+            )
         if assistant_msg:
+            messages.append(
+                {
+                    "role": "assistant",
+                    "content": [{"type": "text", "text": assistant_msg}],
+                }
+            )
+    # محتوى رسالة المستخدم الحالية
     user_content = []
     if image is not None:
         user_content.append({"type": "text", "text": user_text.strip()})
     if user_content:
+        messages.append(
+            {
+                "role": "user",
+                "content": user_content,
+            }
+        )
     return messages
+# =========================================================
+# دالة الاستدلال (تعمل على ZeroGPU)
+# =========================================================
 @spaces.GPU(duration=120)
 def qwen3_omni_inference(
     max_tokens,
 ):
     """
+    - تنفيذ الاستدلال على ZeroGPU.
+    - يدعم نص + صورة + صوت + فيديو في نفس الرسالة.
+    - مخرج نصي دائماً، ومخرج صوتي اختياري.
     """
+    # في حالة عدم وجود مداخل من المستخدم
     if not (user_text or image is not None or audio_path or video_path):
         return history, None, "", None, None, None
     load_model()
         video_path=video_path,
     )
+    # بناء نص المحادثة باستخدام chat_template
     text_prompt = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=False,
     )
+    # تجهيز الوسائط المتعددة
     audios, images, videos = process_mm_info(
         messages,
         use_audio_in_video=USE_AUDIO_IN_VIDEO,
         use_audio_in_video=USE_AUDIO_IN_VIDEO,
     )
+    # نقل إلى جهاز النموذج ونفس dtype
     first_param = next(model.parameters())
     device = first_param.device
     dtype = first_param.dtype
     inputs = inputs.to(device=device, dtype=dtype)
+    # إعدادات التوليد
     gen_kwargs = dict(
         temperature=float(temperature),
         top_p=float(top_p),
         use_audio_in_video=USE_AUDIO_IN_VIDEO,
     )
+    # توليد نص فقط أو نص + صوت
     if not return_audio:
         gen_kwargs["return_audio"] = False
+        text_ids, audio_out = model.generate(**inputs, **gen_kwargs)
         audio_out = None
     else:
         gen_kwargs["speaker"] = speaker
         text_ids, audio_out = model.generate(**inputs, **gen_kwargs)
+    # استخراج النص الناتج (بدون مدخل prompt)
     input_len = inputs["input_ids"].shape[1]
     generated_text = processor.batch_decode(
         text_ids.sequences[:, input_len:],
     )[0]
     # تحديث تاريخ الدردشة
+    user_display = (
+        user_text if (user_text and user_text.strip()) else "[Multimodal message]"
+    )
     history = history + [[user_display, generated_text]]
+    # تجهيز الصوت الناتج إن وجد
     gr_audio = None
     if audio_out is not None:
         audio_np = audio_out.reshape(-1).detach().cpu().numpy()
     return history, gr_audio, "", None, None, None
+# =========================================================
+# Gradio interface functions
+# =========================================================
 def clear_chat():
+    """Return an empty list and None, used to reset the chat history and the audio output."""
     return [], None
             """
             <h1 style="text-align:center;">Qwen3-Omni-30B-A3B – ZeroGPU Chat</h1>
             <p style="text-align:center;">
+            دردشة متعددة الوسائط (نص + صورة + صوت + فيديو) تعمل على ZeroGPU.<br/>
+            اكتب رسالتك، ويمكنك إضافة صورة/صوت/فيديو، ثم اضغط <b>إرسال</b> أو Enter.<br/>
             (لإضافة سطر جديد استخدم Shift+Enter)
             </p>
             """
         )
         with gr.Row():
+            # العمود الأيسر: المحادثة
             with gr.Column(scale=3):
                 chatbot = gr.Chatbot(
                     label="المحادثة",
                     autoplay=True,
                 )
                 with gr.Row():
                     user_text = gr.Textbox(
                         label="رسالتك",
                     send_btn = gr.Button("إرسال", variant="primary", scale=2)
                     clear_btn = gr.Button("مسح المحادثة", variant="secondary")
+            # العمود الأيمن: الإعدادات
             with gr.Column(scale=1):
                 gr.Markdown("### إعدادات النموذج")
                 gr.Markdown(
                     """
+                    **ملاحظات:**
+                    - يمكنك إرسال نص فقط، أو نص مع صورة/صوت/فيديو في رسالة واحدة.
+                    - Enter للإرسال، وShift+Enter لسطر جديد.
+                    - تشغيل النموذج على ZeroGPU قد يستغرق عدة ثوانٍ حسب طول الرسالة.
                     """
                 )
+        # حالة المحادثة
+        history_state = gr.State([])
+        # مدخلات دالة الإرسال
+        send_inputs = [
+            history_state,
+            user_text,
+            image_input,
+            audio_input,
+            video_input,
+            system_prompt,
+            return_audio,
+            speaker,
+            temperature,
+            top_p,
+            max_tokens,
+        ]
+        send_outputs = [
+            history_state,
+            audio_output,
+            user_text,
+            image_input,
+            audio_input,
+            video_input,
+        ]
+        # إرسال بالزر
+        send_btn.click(
+            fn=qwen3_omni_inference,
+            inputs=send_inputs,
+            outputs=send_outputs,
+            queue=True,
+        ).then(
+            lambda h: h,
+            inputs=history_state,
+            outputs=chatbot,
+        )
+        # إرسال بالـ Enter من Textbox
+        user_text.submit(
+            fn=qwen3_omni_inference,
+            inputs=send_inputs,
+            outputs=send_outputs,
+            queue=True,
+        ).then(
+            lambda h: h,
+            inputs=history_state,
+            outputs=chatbot,
+        )
+        # مسح المحادثة
+        clear_btn.click(
+            fn=clear_chat,
+            inputs=None,
+            outputs=[history_state, audio_output],
+        ).then(
+            lambda: [],
+            inputs=None,
+            outputs=chatbot,
+        ).then(
+            lambda: ("", None, None),
+            inputs=None,
+            outputs=[user_text, image_input, audio_input],
+        )
     return demo
 demo = create_interface()
 if __name__ == "__main__":
+    # SSR is disabled: per the original author it is experimental and may
+    # cause the app to hang on "Starting..." -- NOTE(review): not verified here.
     demo.launch(ssr_mode=False)