Spaces:

prithivMLmods
/

VisionScope-R2

Running on Zero

App Files Files Community

prithivMLmods committed on Jun 6

Commit

45691d2

verified ·

1 Parent(s): b1936b2

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -55

app.py CHANGED Viewed

@@ -62,10 +62,6 @@ model_k = Qwen2VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 def downsample_video(video_path):
-    """
-    Downsamples the video to evenly spaced frames.
-    Each frame is returned as a PIL image along with its timestamp.
-    """
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
@@ -84,16 +80,14 @@ def downsample_video(video_path):
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
-    """
-    Generates responses using the selected model for image input.
-    """
     if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
         processor = processor_m
         model = model_m
     elif model_name == "SpaceThinker-3B":
         processor = processor_z
@@ -109,23 +103,43 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Please upload an image."
         return
-    messages = [{
-        "role": "user",
-        "content": [
-            {"type": "image", "image": image},
-            {"type": "text", "text": text},
-        ]
-    }]
-    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(
-        text=[prompt_full],
-        images=[image],
-        return_tensors="pt",
-        padding=True,
-        truncation=False,
-        max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
@@ -138,16 +152,14 @@ def generate_image(model_name: str, text: str, image: Image.Image,
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
-                   max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
-                   top_k: int = 50,
-                   repetition_penalty: float = 1.2):
-    """
-    Generates responses using the selected model for video input.
-    """
     if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
         processor = processor_m
         model = model_m
     elif model_name == "SpaceThinker-3B":
         processor = processor_z
@@ -164,24 +176,47 @@ def generate_video(model_name: str, text: str, video_path: str,
         return
     frames = downsample_video(video_path)
-    messages = [
-        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-        {"role": "user", "content": [{"type": "text", "text": text}]}
-    ]
-    for frame in frames:
-        image, timestamp = frame
-        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
-        messages[1]["content"].append({"type": "image", "image": image})
-    inputs = processor.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,
-        return_dict=True,
-        return_tensors="pt",
-        truncation=False,
-        max_length=MAX_INPUT_TOKEN_LENGTH
-    ).to(device)
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
         "streamer": streamer,

 ).to(device).eval()
 def downsample_video(video_path):
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
+                  max_new_tokens: int = 1024,
+                  temperature: float = 0.6,
+                  top_p: float = 0.9,
+                  top_k: int = 50,
+                  repetition_penalty: float = 1.2):
     if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
         processor = processor_m
+        tokenizer = tokenizer_m
         model = model_m
     elif model_name == "SpaceThinker-3B":
         processor = processor_z
         yield "Please upload an image."
         return
+    # For Llama-3.1-Nemotron-Nano-VL-8B-V1, manually construct prompt and tokenize
+    if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
+        # Construct a simple prompt since apply_chat_template is not available
+        prompt_full = f"<|image|>{text}<|endoftext|>"
+        inputs = tokenizer(
+            prompt_full,
+            return_tensors="pt",
+            padding=True,
+            truncation=False,
+            max_length=MAX_INPUT_TOKEN_LENGTH
+        ).to(device)
+        # Process image separately
+        image_inputs = processor(image, return_tensors="pt").to(device)
+        inputs.update(image_inputs)
+    else:
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": text},
+            ]
+        }]
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(
+            text=[prompt_full],
+            images=[image],
+            return_tensors="pt",
+            padding=True,
+            truncation=False,
+            max_length=MAX_INPUT_TOKEN_LENGTH
+        ).to(device)
+    streamer = TextIteratorStreamer(
+        tokenizer if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1" else processor,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
+                  max_new_tokens: int = 1024,
+                  temperature: float = 0.6,
+                  top_p: float = 0.9,
+                  top_k: int = 50,
+                  repetition_penalty: float = 1.2):
     if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
         processor = processor_m
+        tokenizer = tokenizer_m
         model = model_m
     elif model_name == "SpaceThinker-3B":
         processor = processor_z
         return
     frames = downsample_video(video_path)
+    if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
+        # Construct a simple prompt for Llama-3.1-Nemotron-Nano-VL-8B-V1
+        prompt_parts = ["<|startoftext|>You are a helpful assistant.<|endoftext|>", text]
+        for frame in frames:
+            image, timestamp = frame
+            prompt_parts.append(f"Frame {timestamp}: <|image|>")
+        prompt_full = " ".join(prompt_parts) + "<|endoftext|>"
+        inputs = tokenizer(
+            prompt_full,
+            return_tensors="pt",
+            padding=True,
+            truncation=False,
+            max_length=MAX_INPUT_TOKEN_LENGTH
+        ).to(device)
+        # Process all frames
+        image_inputs = processor([frame[0] for frame in frames], return_tensors="pt").to(device)
+        inputs.update(image_inputs)
+    else:
+        messages = [
+            {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+            {"role": "user", "content": [{"type": "text", "text": text}]}
+        ]
+        for frame in frames:
+            image, timestamp = frame
+            messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+            messages[1]["content"].append({"type": "image", "image": image})
+        inputs = processor.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt",
+            truncation=False,
+            max_length=MAX_INPUT_TOKEN_LENGTH
+        ).to(device)
+    streamer = TextIteratorStreamer(
+        tokenizer if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1" else processor,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
     generation_kwargs = {
         **inputs,
         "streamer": streamer,