""" |
|
|
نسخة محسّنة من app.py مع دعم Quantization و Memory Optimization |
|
|
للنماذج الكبيرة على ZeroGPU |
|
|
|
|
|
Optimized version of app.py with Quantization and Memory Optimization |
|
|
for large models on ZeroGPU |
|
|
""" |
|
|
|
|
|
import gc
import os
import tempfile
from typing import Optional

import gradio as gr
import spaces
import torch
from PIL import Image

# uni_moe is the model's own package; BitsAndBytesConfig (for quantization)
# comes from transformers.
try:
    from uni_moe.model.processing_qwen2_vl import Qwen2VLProcessor
    from uni_moe.model.modeling_out import GrinQwen2VLOutForConditionalGeneration
    from uni_moe.qwen_vl_utils import process_mm_info
    from transformers import BitsAndBytesConfig
except ImportError as e:
    print(f"⚠️ Warning: Import error - {e}")
    print("Some features may not work properly.")

# Model and quantization configuration.
MODEL_NAME = "HIT-TMG/Uni-MoE-2.0-Omni"

# USE_4BIT and USE_8BIT are mutually exclusive; 4-bit (NF4) is the more
# aggressive memory saving of the two. USE_FLASH_ATTENTION is defined for
# future use and is not applied below.
USE_4BIT = True
USE_8BIT = False
USE_FLASH_ATTENTION = True
MAX_MEMORY = "20GB"
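# Rough footprint arithmetic (an estimate, not a measurement): NF4 stores
# weights at ~0.5 bytes per parameter, so a ~33B-parameter checkpoint needs
# roughly 16-17 GB for weights alone, versus ~66 GB in float16. Activations
# and the KV cache come on top of that, which is why MAX_MEMORY leaves headroom.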
device = "cuda" if torch.cuda.is_available() else "cpu"

print("=" * 60)
print("🚀 Loading Uni-MoE 2.0 Model")
print(f"📍 Model: {MODEL_NAME}")
print(f"🖥️ Device: {device}")
print(f"⚙️ 4-bit Quantization: {USE_4BIT}")
print(f"⚙️ 8-bit Quantization: {USE_8BIT}")
print("=" * 60)

def load_model_optimized():
    """Load the model with quantization and memory-saving options."""
    global processor, model

    try:
        print("📥 Loading processor...")
        processor = Qwen2VLProcessor.from_pretrained(MODEL_NAME)

        quantization_config = None
        if USE_4BIT:
            print("⚙️ Setting up 4-bit quantization...")
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
            )
        elif USE_8BIT:
            print("⚙️ Setting up 8-bit quantization...")
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
            )
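        # Background on the flags above (they follow the QLoRA recipe): NF4 is
        # a 4-bit data type designed for normally distributed weights, and
        # double quantization also quantizes the per-block scaling constants,
        # saving roughly another 0.4 bits per parameter on average.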
print("📥 Loading model (this may take a few minutes)...") |
|
|
load_kwargs = { |
|
|
"device_map": "auto", |
|
|
"torch_dtype": torch.float16 if not USE_4BIT else None, |
|
|
"trust_remote_code": True, |
|
|
} |
|
|
|
|
|
if quantization_config: |
|
|
load_kwargs["quantization_config"] = quantization_config |
|
|
|
|
|
if device == "cuda" and not USE_4BIT and not USE_8BIT: |
|
|
load_kwargs["max_memory"] = {0: MAX_MEMORY} |
|
|
|
|
|
model = GrinQwen2VLOutForConditionalGeneration.from_pretrained( |
|
|
MODEL_NAME, |
|
|
**load_kwargs |
|
|
) |
|
|
|
|
|
|
|
|
processor.data_args = model.config |
|
|
|
|
|
print("✅ Model loaded successfully!") |
|
|
print(f"💾 Model size: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B parameters") |
|
|
|
|
|
return True |
|
|
|
|
|
except Exception as e: |
|
|
print(f"❌ Error loading model: {str(e)}") |
|
|
return False |
|
|
|
|
|
|
|
|
|
|
|
# Load once at startup.
model_loaded = load_model_optimized()
if not model_loaded:
    processor = None
    model = None
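# If loading failed, the UI still starts; generate_response reports the
# failure as an error string instead of raising.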
def clear_gpu_memory():
    """Free cached GPU memory."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
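# Note: torch.cuda.empty_cache() returns cached blocks to the CUDA driver but
# cannot free tensors that are still referenced, hence the gc.collect() first.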
def estimate_tokens(text: str) -> int:
    """Rough token-count estimate (~1.3 tokens per whitespace-separated word)."""
    return int(len(text.split()) * 1.3)
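# Currently unused helper; e.g. estimate_tokens("How large is this prompt?")
# returns 6 (5 words * 1.3, truncated toward zero).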
# ZeroGPU allocates a GPU for at most `duration` seconds per call.
@spaces.GPU(duration=120)
def generate_response(
    text_input: str,
    image_input: Optional[Image.Image] = None,
    audio_input: Optional[str] = None,
    temperature: float = 1.0,
    max_new_tokens: int = 512,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
) -> str:
    """Generate a response from the model."""
    if model is None or processor is None:
        return "❌ النموذج غير متاح. يرجى التحقق من السجلات.\n❌ Model not available. Please check logs."

    clear_gpu_memory()

    try:
        if not text_input and image_input is None and audio_input is None:
            return "⚠️ يرجى إدخال نص أو صورة أو صوت على الأقل.\n⚠️ Please provide at least text, image, or audio input."

        # Build the multimodal message content.
        content = []
        if text_input:
            content.append({"type": "text", "text": text_input})

        # Gradio hands us a PIL image; the downstream pipeline expects a file
        # path, so write it to a temporary file.
        temp_image_path = None
        if image_input is not None:
            temp_image_path = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg").name
            image_input.save(temp_image_path)
            content.append({"type": "image", "image": temp_image_path})

        if audio_input is not None:
            content.append({"type": "audio", "audio": audio_input})

        messages = [{"role": "user", "content": content}]

        texts = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # Swap the generic modality placeholders for the special-token spans
        # the model was trained with.
        texts = texts.replace(
            "<image>", "<|vision_start|><|image_pad|><|vision_end|>"
        ).replace(
            "<audio>", "<|audio_start|><|audio_pad|><|audio_end|>"
        ).replace(
            "<video>", "<|vision_start|><|video_pad|><|vision_end|>"
        )

        image_inputs, video_inputs, audio_inputs = process_mm_info(messages)
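        # process_mm_info (from uni_moe.qwen_vl_utils) resolves the file paths
        # referenced in `messages` into the image/video/audio inputs that the
        # processor call below consumes.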
        inputs = processor(
            text=texts,
            images=image_inputs,
            videos=video_inputs,
            audios=audio_inputs,
            padding=True,
            return_tensors="pt",
        )

        # Add the batch dimension generate() expects, then move everything to
        # the model's device.
        inputs["input_ids"] = inputs["input_ids"].unsqueeze(0)
        inputs = inputs.to(device=model.device)

        with torch.inference_mode():
            output_ids = model.generate(
                **inputs,
                use_cache=True,
                pad_token_id=processor.tokenizer.eos_token_id,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=True,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
            )

        # Decode only the newly generated tokens, skipping the prompt.
        response = processor.batch_decode(
            output_ids[:, inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True,
        )[0]

        # Clean up the temporary image file and release GPU memory.
        if temp_image_path and os.path.exists(temp_image_path):
            os.unlink(temp_image_path)

        clear_gpu_memory()

        return response

    except Exception as e:
        clear_gpu_memory()
        error_msg = f"❌ خطأ / Error: {str(e)}"
        print(error_msg)
        return error_msg
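# A (hypothetical) direct call, bypassing the UI -- useful for smoke tests:
#   from PIL import Image
#   print(generate_response("Describe this picture", Image.open("photo.jpg")))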
css = """ |
|
|
.rtl { direction: rtl; text-align: right; } |
|
|
.main-header { |
|
|
text-align: center; |
|
|
margin-bottom: 2rem; |
|
|
padding: 2rem; |
|
|
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
|
|
border-radius: 10px; |
|
|
color: white; |
|
|
} |
|
|
.note-box { |
|
|
padding: 1rem; |
|
|
background: #f0f9ff; |
|
|
border-left: 4px solid #3b82f6; |
|
|
border-radius: 4px; |
|
|
margin: 1rem 0; |
|
|
} |
|
|
""" |
|
|
|
|
|
with gr.Blocks(title="Uni-MoE 2.0 Omni - Optimized", theme=gr.themes.Soft(), css=css) as demo:

    gr.HTML("""
    <div class="main-header">
        <h1>🚀 Uni-MoE 2.0 Omni Demo</h1>
        <p style="font-size: 1.1em; margin-top: 1rem;">
            نموذج متعدد الوسائط متقدم - Advanced Omnimodal Model
        </p>
        <p style="font-size: 0.9em; opacity: 0.9; margin-top: 0.5rem;">
            يدعم فهم وتوليد النصوص والصور والصوت<br>
            Supports understanding and generation of text, images, and audio
        </p>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📝 المدخلات / Inputs")

            text_input = gr.Textbox(
                label="النص / Text",
                placeholder="اكتب سؤالك أو وصفك هنا...\nEnter your question or description here...",
                lines=4,
                rtl=True,
            )

            with gr.Row():
                image_input = gr.Image(
                    label="الصورة (اختياري) / Image (Optional)",
                    type="pil",
                    height=300,
                )

                audio_input = gr.Audio(
                    label="الصوت (اختياري) / Audio (Optional)",
                    type="filepath",
                )

            with gr.Accordion("⚙️ إعدادات متقدمة / Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                    label="Temperature (الإبداعية / Creativity)",
                )
                max_tokens = gr.Slider(
                    minimum=64, maximum=2048, value=512, step=64,
                    label="Max Tokens (الطول الأقصى / Max Length)",
                )
                top_p = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.9, step=0.05,
                    label="Top P (التنوع / Diversity)",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0, maximum=2.0, value=1.1, step=0.1,
                    label="Repetition Penalty (تجنب التكرار / Avoid Repetition)",
                )

            with gr.Row():
                submit_btn = gr.Button("🎯 توليد / Generate", variant="primary", size="lg")
                clear_btn = gr.Button("🗑️ مسح / Clear", size="lg")

        with gr.Column(scale=1):
            gr.Markdown("### 💬 النتيجة / Output")

            output = gr.Textbox(
                label="الاستجابة / Response",
                lines=20,
                show_copy_button=True,
                rtl=True,
            )

gr.HTML(""" |
|
|
<div class="note-box"> |
|
|
<h3>📌 ملاحظات مهمة / Important Notes</h3> |
|
|
<ul> |
|
|
<li>⏱️ قد يستغرق التوليد 30-60 ثانية / Generation may take 30-60 seconds</li> |
|
|
<li>💾 يستخدم النموذج quantization لتوفير الذاكرة / Model uses quantization to save memory</li> |
|
|
<li>🔄 يتم تنظيف الذاكرة تلقائياً بعد كل استخدام / Memory is cleared automatically after each use</li> |
|
|
</ul> |
|
|
</div> |
|
|
""") |
|
|
|
|
|
|
|
|
gr.Markdown("### 📚 أمثلة / Examples") |
|
|
gr.Examples( |
|
|
examples=[ |
|
|
["ما هي عاصمة مصر؟ What is the capital of Egypt?", None, None], |
|
|
["صف هذه الصورة بالتفصيل\nDescribe this image in detail", "https://picsum.photos/400/300", None], |
|
|
["قارن بين Python و JavaScript\nCompare Python and JavaScript", None, None], |
|
|
], |
|
|
inputs=[text_input, image_input, audio_input], |
|
|
) |
|
|
|
|
|
|
|
|
gr.Markdown(""" |
|
|
--- |
|
|
### ℹ️ حول النموذج / About the Model |
|
|
|
|
|
**Uni-MoE 2.0 Omni** بني على: |
|
|
- 🧠 Mixture-of-Experts (MoE) architecture |
|
|
- 📊 Qwen2.5-7B base model (~33B parameters with experts) |
|
|
- 🌐 Omni-Modality 3D RoPE for cross-modal alignment |
|
|
- ⚡ Dynamic-Capacity routing mechanism |
|
|
|
|
|
**الأداء / Performance:** |
|
|
- ✅ +7% على فهم الفيديو / video understanding |
|
|
- ✅ +4% على الاستدلال السمعي-البصري / audio-visual reasoning |
|
|
- ✅ متفوق على Qwen2.5-Omni في 50+ معياراً / benchmarks |
|
|
|
|
|
📄 [ورقة بحثية / Paper](https://arxiv.org/abs/2511.12609) | |
|
|
💻 [GitHub](https://github.com/HITsz-TMG/Uni-MoE) | |
|
|
🤗 [Model](https://huggingface.co/HIT-TMG/Uni-MoE-2.0-Omni) |
|
|
""") |
|
|
|
|
|
|
|
|
    submit_btn.click(
        fn=generate_response,
        inputs=[text_input, image_input, audio_input, temperature, max_tokens, top_p, repetition_penalty],
        outputs=output,
    )

    clear_btn.click(
        fn=lambda: (None, None, None, None),
        outputs=[text_input, image_input, audio_input, output],
    )

if __name__ == "__main__": |
|
|
demo.queue(max_size=20, default_concurrency_limit=5) |
|
|
demo.launch( |
|
|
share=False, |
|
|
show_error=True, |
|
|
server_name="0.0.0.0", |
|
|
server_port=7860 |
|
|
) |
|
|
|