Spaces:

prithivMLmods
/

VisionScope-R2

Running on Zero

App Files Community

prithivMLmods committed on Jun 4

Commit

926d2ec

verified ·

1 Parent(s): f341af1

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -14

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 import random
 import uuid
 import json
 import time
 import asyncio
 from threading import Thread
@@ -19,11 +20,9 @@ from transformers import (
     AutoTokenizer,
     TextIteratorStreamer,
 )
-from transformers.image_utils import load_image
-import subprocess
-subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
@@ -32,8 +31,8 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# Load typhoon
-MODEL_ID_M = "Qwen/Qwen2.5-VL-3B-Instruct"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_M,
@@ -42,7 +41,7 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 # Load Space Thinker
-MODEL_ID_Z = "One-RL-to-See-Them-All/Orsta-32B-0326"
 processor_z = AutoProcessor.from_pretrained(MODEL_ID_Z, trust_remote_code=True)
 model_z = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_Z,
@@ -50,7 +49,14 @@ model_z = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 def downsample_video(video_path):
     """
@@ -83,12 +89,15 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     """
     Generates responses using the selected model for image input.
     """
-    if model_name == "Qwen2.5-VL-3B":
         processor = processor_m
         model = model_m
-    elif model_name == "Orsta-32B-0326":
         processor = processor_z
         model = model_z
     else:
         yield "Invalid model selected."
         return
@@ -133,12 +142,15 @@ def generate_video(model_name: str, text: str, video_path: str,
     """
     Generates responses using the selected model for video input.
     """
-    if model_name == "Qwen2.5-VL-3B":
         processor = processor_m
         model = model_m
-    elif model_name == "Orsta-32B-0326":
         processor = processor_z
         model = model_z
     else:
         yield "Invalid model selected."
         return
@@ -239,9 +251,9 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         with gr.Column():
             output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
             model_choice = gr.Radio(
-                choices=["Qwen2.5-VL-3B", "Orsta-32B-0326"],
                 label="Select Model",
-                value="Orsta-32B-0326"
             )
     image_submit.click(

 import random
 import uuid
 import json
+import requests
 import time
 import asyncio
 from threading import Thread
     AutoTokenizer,
     TextIteratorStreamer,
 )
+from transformers import Blip2Processor, Blip2ForConditionalGeneration
+from transformers.image_utils import load_image
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 2048
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Load SkyCaptioner-V1
+MODEL_ID_M = "Skywork/SkyCaptioner-V1"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_M,
 ).to(device).eval()
 # Load Space Thinker
+MODEL_ID_Z = "remyxai/SpaceThinker-Qwen2.5VL-3B"
 processor_z = AutoProcessor.from_pretrained(MODEL_ID_Z, trust_remote_code=True)
 model_z = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_Z,
     torch_dtype=torch.float16
 ).to(device).eval()
+# Load blip2-opt-2.7b
+MODEL_ID_K = "Salesforce/blip2-opt-2.7b"
+processor_k = Blip2Processor.from_pretrained(MODEL_ID_K, trust_remote_code=True)
+model_k = Blip2ForConditionalGeneration.from_pretrained(
+    MODEL_ID_K,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to(device).eval()
 def downsample_video(video_path):
     """
     """
     Generates responses using the selected model for image input.
     """
+    if model_name == "SkyCaptioner-V1":
         processor = processor_m
         model = model_m
+    elif model_name == "SpaceThinker-3B":
         processor = processor_z
         model = model_z
+    elif model_name == "blip2-opt-2.7b":
+        processor = processor_k
+        model = model_k
     else:
         yield "Invalid model selected."
         return
     """
     Generates responses using the selected model for video input.
     """
+    if model_name == "SkyCaptioner-V1":
         processor = processor_m
         model = model_m
+    elif model_name == "SpaceThinker-3B":
         processor = processor_z
         model = model_z
+    elif model_name == "blip2-opt-2.7b":
+        processor = processor_k
+        model = model_k
     else:
         yield "Invalid model selected."
         return
         with gr.Column():
             output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
             model_choice = gr.Radio(
+                choices=["SkyCaptioner-V1", "SpaceThinker-3B", "blip2-opt-2.7b"],
                 label="Select Model",
+                value="SkyCaptioner-V1"
             )
     image_submit.click(