Derr11 committed on
Commit 210a758 · verified · 1 Parent(s): ce07ef2

Update app.py

Files changed (1):
  1. app.py +227 -346

app.py CHANGED
@@ -1,392 +1,273 @@
- import os
- import torch
  import gradio as gr
  import spaces
  from PIL import Image
- from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
- import warnings
- warnings.filterwarnings("ignore")
-
- # =========================================================
- # Model configuration
- # =========================================================

- MODEL_ID = "openbmb/MiniCPM-o-2_6"

- # Lazy-load the model
- model = None
- tokenizer = None


- def load_model():
-     """Load the model only when it is needed"""
-     global model, tokenizer
-
-     if model is not None:
-         return

-     print(f"Loading {MODEL_ID}...")
-
-     # Use float16 for compatibility with ZeroGPU
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-     dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-
-     try:
-         # Load the tokenizer first
-         tokenizer = AutoTokenizer.from_pretrained(
-             MODEL_ID,
-             trust_remote_code=True,
-             use_fast=False
-         )
-
-         # Load the model with trust_remote_code=True
-         model = AutoModel.from_pretrained(
-             MODEL_ID,
-             trust_remote_code=True,
-             torch_dtype=dtype,
-             low_cpu_mem_usage=True,
-             attn_implementation="eager",
-         ).eval()
-
-         if torch.cuda.is_available():
-             model = model.cuda()
-
-         print("Model loaded successfully!")
-
-     except Exception as e:
-         print(f"Error with AutoModel, trying AutoModelForCausalLM: {e}")
-
-         # Fallback attempt with AutoModelForCausalLM
-         try:
-             model = AutoModelForCausalLM.from_pretrained(
-                 MODEL_ID,
-                 trust_remote_code=True,  # very important!
-                 torch_dtype=dtype,
-                 low_cpu_mem_usage=True,
-                 attn_implementation="eager"
-             ).eval()
-
-             if torch.cuda.is_available():
-                 model = model.cuda()
-
-             print("Model loaded successfully with AutoModelForCausalLM!")
-
-         except Exception as e2:
-             print(f"Failed to load model: {e2}")
-             raise RuntimeError(f"Could not load model: {e2}")
-

- # =========================================================
- # Image-processing helper
- # =========================================================
-
- def process_image(image_input):
-     """Prepare the image for the model"""
-     if image_input is None:
-         return None
-
-     if isinstance(image_input, str):
-         return Image.open(image_input).convert('RGB')
-     else:
-         return image_input.convert('RGB')

-
- # =========================================================
- # Inference function with ZeroGPU
- # =========================================================
-
- @spaces.GPU(duration=60)
  def generate_response(
-     text_input,
-     image_input,
-     temperature,
-     top_p,
-     max_new_tokens
  ):
      """
-     Process text and images with MiniCPM-o-2_6
      """
-
-     if not text_input and not image_input:
-         return "Please provide text or image input."

      try:
-         load_model()
-         global model, tokenizer

-         # Prepare the inputs
          if image_input is not None:
-             # Image + text
-             image = process_image(image_input)
-
-             if not text_input:
-                 text_input = "What is shown in this image? Please describe in detail."
-
-             # Check whether the model exposes a chat method
-             if hasattr(model, 'chat'):
-                 try:
-                     # Use the model's dedicated chat method
-                     msgs = [{"role": "user", "content": [image, text_input]}]
-
-                     with torch.no_grad():
-                         response = model.chat(
-                             image=image,
-                             msgs=msgs,
-                             tokenizer=tokenizer,
-                             sampling=True,
-                             temperature=temperature,
-                             top_p=top_p,
-                             max_new_tokens=max_new_tokens
-                         )
-
-                     return response
-
-                 except Exception as e:
-                     print(f"Chat method failed: {e}")
-                     # Fall through to the plain path
-
-             # Fallback path for images:
-             # combine the text with an image placeholder
-             prompt = f"Image: [Image will be processed]\n\nQuestion: {text_input}\n\nAnswer:"
-
-         else:
-             # Text only
-             prompt = text_input

-         # Plain text processing
-         inputs = tokenizer(
-             prompt,
-             return_tensors="pt",
-             padding=True,
-             truncation=True,
-             max_length=2048
          )

-         if torch.cuda.is_available():
-             inputs = {k: v.cuda() for k, v in inputs.items() if v is not None}

-         # Generation settings
-         gen_kwargs = {
-             "max_new_tokens": max_new_tokens,
-             "temperature": temperature if temperature > 0 else 1e-7,
-             "top_p": top_p,
-             "do_sample": temperature > 0,
-             "pad_token_id": tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
-             "eos_token_id": tokenizer.eos_token_id,
-         }

          # Generation
          with torch.no_grad():
-             outputs = model.generate(**inputs, **gen_kwargs)

-         # Decode
-         response = tokenizer.decode(
-             outputs[0][inputs['input_ids'].shape[1]:],
              skip_special_tokens=True
-         )

-         return response.strip()

      except Exception as e:
-         import traceback
-         traceback.print_exc()
-         return f"Error: {str(e)}"

- # =========================================================
- # UI helper functions
- # =========================================================
-
- def clear_all():
-     """Clear all inputs and outputs"""
-     return "", None, ""
-
-
- def update_examples_visibility(show_examples):
-     """Toggle the examples' visibility"""
-     return gr.update(visible=show_examples)
-
-
- # =========================================================
- # Gradio interface
- # =========================================================
-
- def create_demo():
-     """Build the simple Gradio interface"""

-     with gr.Blocks(title="MiniCPM-o-2.6", css="""
-         .gradio-container {
-             max-width: 1200px;
-             margin: auto;
-         }
-         h1 {
-             text-align: center;
-         }
-         .contain {
-             background: white;
-             border-radius: 10px;
-             padding: 20px;
-         }
-     """) as demo:
-
-         gr.Markdown(
-             """
-             # 🤖 MiniCPM-o-2.6 - Multimodal AI Assistant

-             <div style="text-align: center;">
-             <p>
-             <b>8B parameters model</b> with GPT-4 level performance<br>
-             Supports: Text Generation, Image Understanding, OCR, and Multi-lingual conversations
-             </p>
-             </div>
-             """
-         )
-
-         with gr.Row():
-             # Main column
-             with gr.Column(scale=2):
-                 with gr.Group():
-                     text_input = gr.Textbox(
-                         label="💭 Text Input",
-                         placeholder="Enter your question or prompt here...\nYou can ask about images, request text generation, or have a conversation.",
-                         lines=4,
-                         elem_id="text_input"
-                     )
-
-                     image_input = gr.Image(
-                         label="📷 Image Input (Optional)",
-                         type="pil",
-                         elem_id="image_input"
-                     )
-
-                     with gr.Row():
-                         submit_btn = gr.Button(
-                             "🚀 Generate Response",
-                             variant="primary",
-                             scale=2
-                         )
-                         clear_btn = gr.Button(
-                             "🗑️ Clear All",
-                             variant="secondary",
-                             scale=1
-                         )
-
-                 output = gr.Textbox(
-                     label="🤖 AI Response",
-                     lines=10,
-                     interactive=False,
-                     elem_id="output"
-                 )

-             # Settings column
-             with gr.Column(scale=1):
-                 with gr.Group():
-                     gr.Markdown("### ⚙️ Generation Settings")
-
-                     temperature = gr.Slider(
-                         label="Temperature",
-                         minimum=0.0,
-                         maximum=1.5,
-                         value=0.7,
-                         step=0.1,
-                         info="Controls randomness (0=deterministic, 1.5=very creative)"
-                     )
-
-                     top_p = gr.Slider(
-                         label="Top-p (Nucleus Sampling)",
-                         minimum=0.1,
-                         maximum=1.0,
-                         value=0.9,
-                         step=0.05,
-                         info="Controls diversity of output"
-                     )
-
-                     max_new_tokens = gr.Slider(
-                         label="Max New Tokens",
-                         minimum=50,
-                         maximum=2048,
-                         value=512,
-                         step=50,
-                         info="Maximum length of generated response"
-                     )

-                 gr.Markdown(
-                     """
-                     ### 📚 Quick Tips:
-
-                     **Text Generation:**
-                     - Ask questions
-                     - Request explanations
-                     - Generate creative content
-
-                     **Image Understanding:**
-                     - Upload an image
-                     - Ask about contents
-                     - Request OCR/text extraction
-                     - Get detailed descriptions
-
-                     **Languages:**
-                     - English, Chinese, Arabic
-                     - And many more!
-                     """
                  )

-         # Examples
-         with gr.Group():
-             gr.Markdown("### 💡 Example Prompts")
-             gr.Examples(
-                 examples=[
-                     ["Explain quantum computing in simple terms for a beginner.", None],
-                     ["Write a short story about a robot learning to paint.", None],
-                     ["What are the main differences between Python and JavaScript?", None],
-                     ["Create a healthy meal plan for one week.", None],
-                     ["Translate 'Hello, how are you?' to French, Spanish, and Arabic.", None],
-                 ],
-                 inputs=[text_input, image_input],
-                 outputs=output,
-                 fn=lambda t, i: generate_response(t, i, 0.7, 0.9, 512),
-                 cache_examples=False,
-                 label="Click any example to try it"
              )
-
-         # Wire up the events
-         submit_btn.click(
-             fn=generate_response,
-             inputs=[text_input, image_input, temperature, top_p, max_new_tokens],
-             outputs=output,
-             api_name="generate"
-         )
-
-         text_input.submit(
-             fn=generate_response,
-             inputs=[text_input, image_input, temperature, top_p, max_new_tokens],
-             outputs=output
-         )
-
-         clear_btn.click(
-             fn=clear_all,
-             inputs=[],
-             outputs=[text_input, image_input, output]
-         )
-
-         # Welcome message on load
-         demo.load(
-             lambda: gr.Info("Model is loading... This may take a moment on first use."),
-             inputs=None,
-             outputs=None
-         )

-         return demo


- # =========================================================
  # Run the app
- # =========================================================
-
  if __name__ == "__main__":
-     demo = create_demo()
      demo.launch(
-         ssr_mode=False,
          show_error=True,
-         share=False
-     )
  import gradio as gr
+ import torch
  import spaces
  from PIL import Image
+ import numpy as np
+ import os
+ import tempfile
+
+ # Import the required libraries from Uni-MoE
+ try:
+     from uni_moe.model.processing_qwen2_vl import Qwen2VLProcessor
+     from uni_moe.model.modeling_out import GrinQwen2VLOutForConditionalGeneration
+     from uni_moe.qwen_vl_utils import process_mm_info
+     from uni_moe.model import deepspeed_moe_inference_utils
+ except ImportError:
+     print("⚠️ Warning: Uni-MoE libraries not fully imported. Some features may not work.")
+
+ # Load the model
+ MODEL_NAME = "HIT-TMG/Uni-MoE-2.0-Omni"
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ print(f"🚀 Loading model: {MODEL_NAME}")
+ print(f"📍 Device: {device}")
+
+ # Load the processor and the model
+ try:
+     processor = Qwen2VLProcessor.from_pretrained(MODEL_NAME)
+     model = GrinQwen2VLOutForConditionalGeneration.from_pretrained(
+         MODEL_NAME,
+         torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
+         device_map="auto"
+     )
+     if device == "cuda":
+         model = model.cuda()
+
+     # Set data_args on the processor
+     processor.data_args = model.config
+     print("✅ Model loaded successfully!")
+ except Exception as e:
+     print(f"❌ Error loading model: {str(e)}")
+     processor = None
+     model = None
+
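+ # If loading failed above, processor and model remain None; generate_response()
+ # checks for that and returns an error message, so the UI below can still launch.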
+
+ @spaces.GPU(duration=120)  # use ZeroGPU for up to 120 seconds
  def generate_response(
+     text_input: str,
+     image_input: Image.Image = None,
+     audio_input: str = None,
+     temperature: float = 1.0,
+     max_new_tokens: int = 512
  ):
      """
+     Generate a response from the model based on the given inputs
      """
+     if model is None or processor is None:
+         return "❌ The model is currently unavailable. Please try again later."

      try:
+         # Build the user message
+         content = []
+
+         # Add the text
+         if text_input:
+             content.append({"type": "text", "text": text_input})
+
+         # Add the image
          if image_input is not None:
+             # Save the image to a temporary file
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp_img:
+                 image_input.save(tmp_img.name)
+                 content.append({"type": "image", "image": tmp_img.name})
+
+         # Add the audio
+         if audio_input is not None:
+             content.append({"type": "audio", "audio": audio_input})
+
+         if not content:
+             return "⚠️ Please provide text, an image, or audio."
+
+         # Build the messages
+         messages = [{
+             "role": "user",
+             "content": content
+         }]
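+         # For a text+image request, messages ends up shaped like
+         # (illustrative values, not produced by the code above):
+         #   [{"role": "user", "content": [
+         #       {"type": "text", "text": "Describe this image"},
+         #       {"type": "image", "image": "/tmp/tmpXXXX.jpg"}]}]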
+
+         # Apply the chat template to the messages
+         texts = processor.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True
          )

+         # Replace the special placeholder tags with the model's tokens
+         texts = texts.replace(
+             "<image>", "<|vision_start|><|image_pad|><|vision_end|>"
+         ).replace(
+             "<audio>", "<|audio_start|><|audio_pad|><|audio_end|>"
+         ).replace(
+             "<video>", "<|vision_start|><|video_pad|><|vision_end|>"
+         )
+
103
+ # معالجة الوسائط
104
+ image_inputs, video_inputs, audio_inputs = process_mm_info(messages)
105
+
106
+ # تجهيز المدخلات
107
+ inputs = processor(
108
+ text=texts,
109
+ images=image_inputs,
110
+ videos=video_inputs,
111
+ audios=audio_inputs,
112
+ padding=True,
113
+ return_tensors="pt",
114
+ )
115
 
116
+ inputs["input_ids"] = inputs["input_ids"].unsqueeze(0)
117
+ inputs = inputs.to(device=model.device)
 
 
 
 
 
 
 
118
 
119
  # التوليد
120
  with torch.no_grad():
121
+ output_ids = model.generate(
122
+ **inputs,
123
+ use_cache=True,
124
+ pad_token_id=processor.tokenizer.eos_token_id,
125
+ max_new_tokens=max_new_tokens,
126
+ temperature=temperature,
127
+ do_sample=True
128
+ )
129
 
130
+ # فك تشفير النتي��ة
131
+ response = processor.batch_decode(
132
+ output_ids[:, inputs["input_ids"].shape[-1]:],
133
  skip_special_tokens=True
134
+ )[0]
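+         # Slicing past inputs["input_ids"].shape[-1] drops the echoed prompt
+         # tokens, so only the newly generated completion is decoded.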
+
+         return response

      except Exception as e:
+         return f"❌ An error occurred: {str(e)}"


+ # Build the Gradio interface
+ with gr.Blocks(
+     title="Uni-MoE 2.0 Omni Demo",
+     theme=gr.themes.Soft(),
+     css="""
+     .rtl { direction: rtl; text-align: right; }
+     .main-header { text-align: center; margin-bottom: 2rem; }
+     """
+ ) as demo:
+
+     gr.Markdown("""
+     <div class="main-header">
+
+     # 🚀 Uni-MoE 2.0 Omni Demo
+
+     An advanced omnimodal model supporting understanding and generation of **text, images, and audio**
+
+     </div>
+     """)
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             gr.Markdown("### 📝 Inputs")

+             text_input = gr.Textbox(
+                 label="Text",
+                 placeholder="Enter your question or description here...",
+                 lines=3,
+                 rtl=True
+             )

+             image_input = gr.Image(
+                 label="Image (Optional)",
+                 type="pil"
+             )
+
+             audio_input = gr.Audio(
+                 label="Audio (Optional)",
+                 type="filepath"
+             )
+
+             with gr.Accordion("⚙️ Advanced Settings", open=False):
+                 temperature = gr.Slider(
+                     minimum=0.1,
+                     maximum=2.0,
+                     value=1.0,
+                     step=0.1,
+                     label="Temperature"
+                 )

+                 max_tokens = gr.Slider(
+                     minimum=64,
+                     maximum=2048,
+                     value=512,
+                     step=64,
+                     label="Max New Tokens"
                  )
+
+             submit_btn = gr.Button("🎯 Generate", variant="primary")
+             clear_btn = gr.Button("🗑️ Clear")

+         with gr.Column(scale=1):
+             gr.Markdown("### 💬 Output")
+
+             output = gr.Textbox(
+                 label="Response",
+                 lines=15,
+                 show_copy_button=True,
+                 rtl=True
              )

+     # Examples
+     gr.Markdown("### 📚 Examples")
+     gr.Examples(
+         examples=[
+             ["What is the capital of Egypt?", None, None],
+             ["Describe this image in detail", "https://picsum.photos/400/300", None],
+             ["What is the capital of France?", None, None],
+             ["Describe this image in detail", "https://picsum.photos/400/300", None],
+         ],
+         inputs=[text_input, image_input, audio_input],
+     )
+
+     # Additional information
+     gr.Markdown("""
+     ---
+     ### ℹ️ Information
+
+     **Uni-MoE 2.0 Omni** is an omnimodal language model built on:
+     - 🧠 **Mixture-of-Experts (MoE)** for compute efficiency
+     - 🔄 **Qwen2.5-7B** as the base model
+     - 🎯 **Omni-Modality 3D RoPE** for multimodal alignment
+
+     **Capabilities:**
+     - ✅ Understanding of text, images, audio, and video
+     - ✅ Generation of text, images, and audio
+     - ✅ Multimodal reasoning
+
+     📄 **Paper:** [arXiv:2511.12609](https://arxiv.org/abs/2511.12609)
+
+     🔗 **GitHub:** [HITsz-TMG/Uni-MoE](https://github.com/HITsz-TMG/Uni-MoE)
+
+     ---
+     <p style="text-align: center; color: #666;">
+     Built with Gradio and ZeroGPU 🚀
+     </p>
+     """)
+
+     # Wire up the events
+     submit_btn.click(
+         fn=generate_response,
+         inputs=[text_input, image_input, audio_input, temperature, max_tokens],
+         outputs=output
+     )
+
+     clear_btn.click(
+         fn=lambda: (None, None, None, None),
+         outputs=[text_input, image_input, audio_input, output]
+     )


  # Run the app
  if __name__ == "__main__":
+     demo.queue(max_size=10)
      demo.launch(
+         share=False,
          show_error=True,
+         server_name="0.0.0.0",
+         server_port=7860
+     )