updated object detector
- README.md +1 -1
- app.py +18 -1
- inference.py +19 -8
- models/detectors/base.py +2 -1
- models/detectors/yolov8.py +69 -0
- models/model_loader.py +2 -0
- requirements.txt +2 -0
README.md
CHANGED

@@ -12,7 +12,7 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 
 ## Mission-guided detections
 
-1. Send a `POST /process_video` request with fields `video` (file) and `prompt` (mission text).
+1. Send a `POST /process_video` request with fields `video` (file) and `prompt` (mission text). Optionally include `detector` (`owlv2` or `hf_yolov8`) to pick the backend per request; if omitted the server uses its default/`OBJECT_DETECTOR` setting.
 2. The backend feeds the mission text into an OpenAI (`gpt-4o-mini`) reasoning step that scores and ranks every YOLO/COCO class. Place your API key inside `.env` as either `OPENAI_API_KEY=...` or `OpenAI-API: ...`; the server loads it automatically on startup.
 3. The top scored classes become the textual queries for the existing OWLv2 detector so the detections align with the mission.
 4. After object detection finishes, another OpenAI call ingests the detection log plus the first/middle/last frame context and produces a natural-language summary of the mission outcome.
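For quick testing, a minimal Python client for the endpoint above might look like the following sketch. The endpoint path and form fields come from the README; the host/port, file names, and prompt are illustrative, and the `x-mission-summary` header is the one exposed by the CORS change in app.py below:

```python
import requests

# Sketch: POST a video plus mission prompt, optionally selecting the
# detector backend per request ("owlv2" or "hf_yolov8").
with open("mission.mp4", "rb") as fh:  # illustrative input file
    response = requests.post(
        "http://localhost:8000/process_video",  # assumed host/port
        files={"video": fh},
        data={"prompt": "find all delivery trucks", "detector": "hf_yolov8"},
    )
response.raise_for_status()

# The mission summary is expected in the exposed x-mission-summary header.
print(response.headers.get("x-mission-summary"))

# The processed video is assumed to come back as the response body.
with open("processed.mp4", "wb") as out:
    out.write(response.content)
```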
app.py
CHANGED

@@ -2,8 +2,10 @@ import logging
 import os
 import tempfile
 from pathlib import Path
+from typing import Optional
 
 from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
+from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse, JSONResponse
 import uvicorn
 

@@ -12,6 +14,14 @@ from inference import run_inference
 logging.basicConfig(level=logging.INFO)
 
 app = FastAPI(title="Video Processing Backend")
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+    expose_headers=["x-mission-summary"],
+)
 
 
 def _save_upload_to_tmp(upload: UploadFile) -> str:

@@ -45,6 +55,7 @@ async def process_video(
     background_tasks: BackgroundTasks,
     video: UploadFile = File(...),
     prompt: str = Form(...),
+    detector: Optional[str] = Form(None),
 ):
     if video is None:
         raise HTTPException(status_code=400, detail="Video file is required.")

@@ -63,7 +74,13 @@ async def process_video(
     os.close(fd)
 
     try:
-        output_path, mission_plan, mission_summary = run_inference(input_path, output_path, prompt, max_frames=10)
+        output_path, mission_plan, mission_summary = run_inference(
+            input_path,
+            output_path,
+            prompt,
+            max_frames=10,
+            detector_name=detector,
+        )
     except ValueError as exc:
         logging.exception("Video decoding failed.")
         _safe_delete(input_path)
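The diff does not show where the `x-mission-summary` header is set, only that CORS now exposes it; presumably the endpoint attaches the summary to the file response along these lines (a sketch, not the actual code):

```python
from fastapi.responses import FileResponse

# Hypothetical helper, not taken from the diff: return the processed video
# while surfacing the mission summary through the header exposed by the
# CORS middleware above.
def _build_response(output_path: str, mission_summary: str) -> FileResponse:
    return FileResponse(
        output_path,
        media_type="video/mp4",  # assumed content type
        headers={"x-mission-summary": mission_summary},
    )
```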
inference.py
CHANGED

@@ -24,14 +24,18 @@ def _build_detection_records(
     scores: Sequence[float],
     labels: Sequence[int],
     queries: Sequence[str],
+    label_names: Optional[Sequence[str]] = None,
 ) -> List[Dict[str, Any]]:
     detections: List[Dict[str, Any]] = []
     for idx, box in enumerate(boxes):
-        label_idx = int(labels[idx]) if idx < len(labels) else -1
-        if 0 <= label_idx < len(queries):
-            label = queries[label_idx]
+        if label_names is not None and idx < len(label_names):
+            label = label_names[idx]
         else:
-            label = f"label_{label_idx}"
+            label_idx = int(labels[idx]) if idx < len(labels) else -1
+            if 0 <= label_idx < len(queries):
+                label = queries[label_idx]
+            else:
+                label = f"label_{label_idx}"
         detections.append(
             {
                 "label": label,

@@ -42,12 +46,18 @@ def _build_detection_records(
     return detections
 
 
-def infer_frame(frame: np.ndarray, queries: Sequence[str]) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
-    detector = load_detector()
+def infer_frame(
+    frame: np.ndarray,
+    queries: Sequence[str],
+    detector_name: Optional[str] = None,
+) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
+    detector = load_detector(detector_name)
     text_queries = list(queries) or ["object"]
     try:
         result = detector.predict(frame, text_queries)
-        detections = _build_detection_records(result.boxes, result.scores, result.labels, text_queries)
+        detections = _build_detection_records(
+            result.boxes, result.scores, result.labels, text_queries, result.label_names
+        )
     except Exception:
         logging.exception("Inference failed for queries %s", text_queries)
         raise

@@ -59,6 +69,7 @@ def run_inference(
     output_video_path: str,
     mission_prompt: str,
     max_frames: Optional[int] = None,
+    detector_name: Optional[str] = None,
 ) -> Tuple[str, MissionPlan, str]:
     try:
         frames, fps, width, height = extract_frames(input_video_path)

@@ -76,7 +87,7 @@ def run_inference(
         if max_frames is not None and idx >= max_frames:
             break
         logging.debug("Processing frame %d", idx)
-        processed_frame, detections = infer_frame(frame, queries)
+        processed_frame, detections = infer_frame(frame, queries, detector_name=detector_name)
         detection_log.append({"frame_index": idx, "detections": detections})
         processed_frames.append(processed_frame)
 
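The new label-resolution order in `_build_detection_records` (explicit `label_names` first, then `labels` indexed into `queries`, then a `label_<idx>` fallback) can be checked in isolation; the values below are illustrative:

```python
import numpy as np

from inference import _build_detection_records

boxes = np.array([[0, 0, 10, 10], [5, 5, 20, 20]], dtype=np.float32)

# YOLOv8 path: label_names win outright.
records = _build_detection_records(
    boxes, [0.9, 0.8], [0, 1], ["person", "car"], label_names=["person", "truck"]
)
assert [r["label"] for r in records] == ["person", "truck"]

# OWLv2 path: labels index into the queries, with a fallback for
# out-of-range indices.
records = _build_detection_records(boxes, [0.9, 0.8], [1, 5], ["person", "car"])
assert [r["label"] for r in records] == ["car", "label_5"]
```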
models/detectors/base.py
CHANGED

@@ -1,4 +1,4 @@
-from typing import NamedTuple, Sequence
+from typing import NamedTuple, Optional, Sequence
 
 import numpy as np
 

@@ -7,6 +7,7 @@ class DetectionResult(NamedTuple):
     boxes: np.ndarray
     scores: Sequence[float]
     labels: Sequence[int]
+    label_names: Optional[Sequence[str]] = None
 
 
 class ObjectDetector:
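Because `label_names` is added as a defaulted `NamedTuple` field, existing detectors that construct `DetectionResult` positionally with three values keep working unchanged; for example:

```python
import numpy as np

from models.detectors.base import DetectionResult

# Legacy construction (e.g. the OWLv2 path): label_names defaults to None.
legacy = DetectionResult(np.empty((0, 4), dtype=np.float32), [], [])
assert legacy.label_names is None

# The new YOLOv8 detector fills the field explicitly.
named = DetectionResult(np.empty((0, 4), dtype=np.float32), [], [], label_names=[])
assert named.label_names == []
```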
models/detectors/yolov8.py
ADDED

@@ -0,0 +1,69 @@
+import logging
+from typing import List, Sequence
+
+import numpy as np
+import torch
+from huggingface_hub import hf_hub_download
+from ultralytics import YOLO
+
+from models.detectors.base import DetectionResult, ObjectDetector
+
+
+class HuggingFaceYoloV8Detector(ObjectDetector):
+    """YOLOv8 detector whose weights are fetched from the Hugging Face Hub."""
+
+    REPO_ID = "Ultralytics/YOLOv8"
+    WEIGHT_FILE = "yolov8s.pt"
+
+    def __init__(self, score_threshold: float = 0.3) -> None:
+        self.name = "hf_yolov8"
+        self.score_threshold = score_threshold
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        logging.info(
+            "Loading Hugging Face YOLOv8 weights %s/%s onto %s",
+            self.REPO_ID,
+            self.WEIGHT_FILE,
+            self.device,
+        )
+        weight_path = hf_hub_download(repo_id=self.REPO_ID, filename=self.WEIGHT_FILE)
+        self.model = YOLO(weight_path)
+        self.model.to(self.device)
+        self.class_names = self.model.names
+
+    def _filter_indices(self, label_names: Sequence[str], queries: Sequence[str]) -> List[int]:
+        if not queries:
+            return list(range(len(label_names)))
+        allowed = {query.lower().strip() for query in queries if query}
+        keep = [idx for idx, name in enumerate(label_names) if name.lower() in allowed]
+        return keep or list(range(len(label_names)))
+
+    def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
+        device_arg = 0 if self.device.startswith("cuda") else "cpu"
+        results = self.model.predict(
+            source=frame,
+            device=device_arg,
+            conf=self.score_threshold,
+            verbose=False,
+        )
+        result = results[0]
+        boxes = result.boxes
+        if boxes is None or boxes.xyxy is None:
+            empty = np.empty((0, 4), dtype=np.float32)
+            return DetectionResult(empty, [], [], [])
+
+        xyxy = boxes.xyxy.cpu().numpy()
+        scores = boxes.conf.cpu().numpy().tolist()
+        label_ids = boxes.cls.cpu().numpy().astype(int).tolist()
+        label_names = [self.class_names.get(idx, f"class_{idx}") for idx in label_ids]
+        keep_indices = self._filter_indices(label_names, queries)
+        xyxy = xyxy[keep_indices] if len(xyxy) else xyxy
+        scores = [scores[i] for i in keep_indices]
+        label_ids = [label_ids[i] for i in keep_indices]
+        label_names = [label_names[i] for i in keep_indices]
+        return DetectionResult(
+            boxes=xyxy,
+            scores=scores,
+            labels=label_ids,
+            label_names=label_names,
+        )
+
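The new detector can be exercised on its own; note that instantiating it downloads `yolov8s.pt` from the `Ultralytics/YOLOv8` repo on first run, and the black test frame here is purely illustrative:

```python
import numpy as np

from models.detectors.yolov8 import HuggingFaceYoloV8Detector

detector = HuggingFaceYoloV8Detector(score_threshold=0.3)
frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a video frame
result = detector.predict(frame, ["person", "car"])
for box, score, name in zip(result.boxes, result.scores, result.label_names):
    print(name, float(score), box.tolist())
```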
models/model_loader.py
CHANGED

@@ -4,11 +4,13 @@ from typing import Callable, Dict, Optional
 
 from models.detectors.base import ObjectDetector
 from models.detectors.owlv2 import Owlv2Detector
+from models.detectors.yolov8 import HuggingFaceYoloV8Detector
 
 DEFAULT_DETECTOR = "owlv2"
 
 _REGISTRY: Dict[str, Callable[[], ObjectDetector]] = {
     "owlv2": Owlv2Detector,
+    "hf_yolov8": HuggingFaceYoloV8Detector,
 }
 
 
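`load_detector` itself is not part of this diff, so its body is not shown; given the registry above and the `load_detector(detector_name)` call in inference.py, it plausibly resolves and caches detector instances along these lines (a sketch only; the caching behavior is an assumption):

```python
# Hypothetical reconstruction of load_detector, not taken from the diff.
_INSTANCES: Dict[str, ObjectDetector] = {}

def load_detector(name: Optional[str] = None) -> ObjectDetector:
    key = (name or DEFAULT_DETECTOR).lower()
    if key not in _REGISTRY:
        raise ValueError(f"Unknown detector '{key}'. Choices: {sorted(_REGISTRY)}")
    if key not in _INSTANCES:
        _INSTANCES[key] = _REGISTRY[key]()  # instantiate once, reuse per process
    return _INSTANCES[key]
```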
requirements.txt
CHANGED

@@ -8,3 +8,5 @@ accelerate
 pillow
 scipy
 openai
+huggingface-hub
+ultralytics