mission detection with summary
Changed files:
- .gitignore +1 -0
- README.md +8 -0
- app.py +5 -2
- inference.py +45 -29
- mission_planner.py +211 -0
- mission_summarizer.py +89 -0
- models/detectors/base.py +18 -0
- models/detectors/owlv2.py +56 -0
- models/model_loader.py +32 -15
- requirements.txt +1 -0
- utils/openai_client.py +67 -0
.gitignore CHANGED
@@ -4,3 +4,4 @@ __pycache__/
 *.log
 *.tmp
 .DS_Store
+.env
README.md CHANGED
@@ -9,3 +9,11 @@ license: mit
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+## Mission-guided detections
+
+1. Send a `POST /process_video` request with fields `video` (file) and `prompt` (mission text).
+2. The backend feeds the mission text into an OpenAI (`gpt-4o-mini`) reasoning step that scores and ranks every YOLO/COCO class. Place your API key in `.env` as either `OPENAI_API_KEY=...` or `OpenAI-API: ...`; the server loads it automatically on startup.
+3. The top-scoring classes become the text queries for the existing OWLv2 detector, so the detections align with the mission.
+4. After object detection finishes, another OpenAI call ingests the detection log plus the first/middle/last frame context and produces a natural-language summary of the mission outcome.
+5. The HTTP response still streams the processed video, and it now embeds the structured mission plan (`x-mission-plan`) and text summary (`x-mission-summary`) in the response headers; see the client sketch after this diff.
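A minimal client sketch for step 5. Assumptions not stated in this commit: the Space serves on localhost:7860, the `requests` package is installed, and a local `drone.mp4` exists.

import json

import requests  # assumed available; not part of this repo's requirements

# Hypothetical endpoint; substitute the real Space URL.
url = "http://localhost:7860/process_video"
with open("drone.mp4", "rb") as video:
    resp = requests.post(
        url,
        files={"video": video},
        data={"prompt": "look for unattended backpacks near benches"},
        timeout=600,
    )
resp.raise_for_status()

# The body is the processed MP4; mission metadata rides in custom headers.
with open("processed.mp4", "wb") as out:
    out.write(resp.content)
plan = json.loads(resp.headers["x-mission-plan"])
print("summary:", resp.headers["x-mission-summary"])
print("queries:", [entry["name"] for entry in plan["classes"]])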
app.py CHANGED
@@ -63,7 +63,7 @@ async def process_video(
     os.close(fd)
 
     try:
-        run_inference(input_path, output_path, prompt, max_frames=10)
+        output_path, mission_plan, mission_summary = run_inference(input_path, output_path, prompt, max_frames=10)
     except ValueError as exc:
         logging.exception("Video decoding failed.")
         _safe_delete(input_path)
@@ -78,11 +78,14 @@ async def process_video(
     _schedule_cleanup(background_tasks, input_path)
     _schedule_cleanup(background_tasks, output_path)
 
-    return FileResponse(
+    response = FileResponse(
         path=output_path,
         media_type="video/mp4",
         filename="processed.mp4",
     )
+    response.headers["x-mission-plan"] = mission_plan.to_json()
+    response.headers["x-mission-summary"] = mission_summary.replace("\n", " ").strip()
+    return response
 
 
 if __name__ == "__main__":
inference.py CHANGED
@@ -1,11 +1,11 @@
 import logging
-from typing import List, Optional
+from typing import Any, Dict, List, Optional, Sequence, Tuple
 
 import cv2
 import numpy as np
-import …
-
-from …
+from models.model_loader import load_detector
+from mission_planner import MissionPlan, get_mission_plan
+from mission_summarizer import summarize_results
 from utils.video import extract_frames, write_video
@@ -19,51 +19,67 @@ def draw_boxes(frame: np.ndarray, boxes: np.ndarray) -> np.ndarray:
     return output
 
 
-def infer_frame(frame, …):
-    …
-    except Exception:
-        logging.exception("Inference failed for …
-        raise
-    return draw_boxes(frame, …
+def _build_detection_records(
+    boxes: np.ndarray,
+    scores: Sequence[float],
+    labels: Sequence[int],
+    queries: Sequence[str],
+) -> List[Dict[str, Any]]:
+    detections: List[Dict[str, Any]] = []
+    for idx, box in enumerate(boxes):
+        label_idx = int(labels[idx]) if idx < len(labels) else -1
+        if 0 <= label_idx < len(queries):
+            label = queries[label_idx]
+        else:
+            label = f"label_{label_idx}"
+        detections.append(
+            {
+                "label": label,
+                "score": float(scores[idx]) if idx < len(scores) else 0.0,
+                "bbox": [int(coord) for coord in box.tolist()],
+            }
+        )
+    return detections
+
+
+def infer_frame(frame: np.ndarray, queries: Sequence[str]) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
+    detector = load_detector()
+    text_queries = list(queries) or ["object"]
+    try:
+        result = detector.predict(frame, text_queries)
+        detections = _build_detection_records(result.boxes, result.scores, result.labels, text_queries)
+    except Exception:
+        logging.exception("Inference failed for queries %s", text_queries)
+        raise
+    return draw_boxes(frame, result.boxes), detections
 
 
 def run_inference(
     input_video_path: str,
     output_video_path: str,
+    mission_prompt: str,
     max_frames: Optional[int] = None,
-) -> str:
+) -> Tuple[str, MissionPlan, str]:
     try:
         frames, fps, width, height = extract_frames(input_video_path)
     except ValueError as exc:
         logging.exception("Failed to decode video at %s", input_video_path)
         raise
 
+    mission_plan = get_mission_plan(mission_prompt)
+    logging.info("Mission plan: %s", mission_plan.to_json())
+    queries = mission_plan.queries()
+
     processed_frames: List[np.ndarray] = []
+    detection_log: List[Dict[str, Any]] = []
     for idx, frame in enumerate(frames):
        if max_frames is not None and idx >= max_frames:
             break
         logging.debug("Processing frame %d", idx)
-        processed_frame = infer_frame(frame, …
+        processed_frame, detections = infer_frame(frame, queries)
+        detection_log.append({"frame_index": idx, "detections": detections})
         processed_frames.append(processed_frame)
 
     write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
-    return …
+    mission_summary = summarize_results(mission_prompt, mission_plan, detection_log)
+    return output_video_path, mission_plan, mission_summary
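For orientation, this is the shape of one `detection_log` entry that `run_inference` accumulates and later hands to `summarize_results`; the values are illustrative only.

# One entry per processed frame; bbox is [x1, y1, x2, y2] in pixels.
entry = {
    "frame_index": 0,
    "detections": [
        {"label": "person", "score": 0.74, "bbox": [112, 80, 240, 310]},
        {"label": "backpack", "score": 0.41, "bbox": [150, 210, 205, 290]},
    ],
}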
mission_planner.py ADDED
@@ -0,0 +1,211 @@
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import asdict, dataclass
+from typing import Dict, List, Tuple
+
+from utils.openai_client import get_openai_client
+
+
+YOLO_CLASSES: Tuple[str, ...] = (
+    "person",
+    "bicycle",
+    "car",
+    "motorcycle",
+    "airplane",
+    "bus",
+    "train",
+    "truck",
+    "boat",
+    "traffic light",
+    "fire hydrant",
+    "stop sign",
+    "parking meter",
+    "bench",
+    "bird",
+    "cat",
+    "dog",
+    "horse",
+    "sheep",
+    "cow",
+    "elephant",
+    "bear",
+    "zebra",
+    "giraffe",
+    "backpack",
+    "umbrella",
+    "handbag",
+    "tie",
+    "suitcase",
+    "frisbee",
+    "skis",
+    "snowboard",
+    "sports ball",
+    "kite",
+    "baseball bat",
+    "baseball glove",
+    "skateboard",
+    "surfboard",
+    "tennis racket",
+    "bottle",
+    "wine glass",
+    "cup",
+    "fork",
+    "knife",
+    "spoon",
+    "bowl",
+    "banana",
+    "apple",
+    "sandwich",
+    "orange",
+    "broccoli",
+    "carrot",
+    "hot dog",
+    "pizza",
+    "donut",
+    "cake",
+    "chair",
+    "couch",
+    "potted plant",
+    "bed",
+    "dining table",
+    "toilet",
+    "tv",
+    "laptop",
+    "mouse",
+    "remote",
+    "keyboard",
+    "cell phone",
+    "microwave",
+    "oven",
+    "toaster",
+    "sink",
+    "refrigerator",
+    "book",
+    "clock",
+    "vase",
+    "scissors",
+    "teddy bear",
+    "hair drier",
+    "toothbrush",
+)
+
+
+DEFAULT_OPENAI_MODEL = "gpt-4o-mini"
+
+
+@dataclass
+class MissionClass:
+    name: str
+    score: float
+    rationale: str
+
+
+@dataclass
+class MissionPlan:
+    mission: str
+    relevant_classes: List[MissionClass]
+
+    def queries(self) -> List[str]:
+        return [entry.name for entry in self.relevant_classes]
+
+    def to_dict(self) -> dict:
+        return {
+            "mission": self.mission,
+            "classes": [asdict(entry) for entry in self.relevant_classes],
+        }
+
+    def to_json(self) -> str:
+        return json.dumps(self.to_dict())
+
+
+class MissionReasoner:
+    def __init__(
+        self,
+        *,
+        model_name: str = DEFAULT_OPENAI_MODEL,
+        top_k: int = 10,
+    ) -> None:
+        self._model_name = model_name
+        self._top_k = top_k
+
+    def plan(self, mission: str) -> MissionPlan:
+        mission = (mission or "").strip()
+        if not mission:
+            raise ValueError("Mission prompt cannot be empty.")
+        response_payload = self._query_llm(mission)
+        relevant = self._parse_plan(response_payload, fallback_mission=mission)
+        return MissionPlan(mission=response_payload.get("mission", mission), relevant_classes=relevant[: self._top_k])
+
+    def _query_llm(self, mission: str) -> Dict[str, object]:
+        client = get_openai_client()
+        system_prompt = (
+            "You are a mission-planning assistant helping a vision system select which YOLO object classes to detect. "
+            "You must only reference the provided list of YOLO classes."
+        )
+        classes_blob = ", ".join(YOLO_CLASSES)
+        user_prompt = (
+            f"Mission: {mission}\n"
+            f"Available YOLO classes: {classes_blob}\n"
+            f"Return JSON with: mission (string) and classes (array). "
+            f"Each entry needs name, score (0-1 float), rationale. "
+            f"Limit to at most {self._top_k} classes. Only choose names from the list."
+        )
+        completion = client.chat.completions.create(
+            model=self._model_name,
+            temperature=0.2,
+            response_format={"type": "json_object"},
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+        )
+        content = completion.choices[0].message.content or "{}"
+        try:
+            return json.loads(content)
+        except json.JSONDecodeError:
+            logging.exception("LLM returned non-JSON content: %s", content)
+            return {"mission": mission, "classes": []}
+
+    def _parse_plan(self, payload: Dict[str, object], fallback_mission: str) -> List[MissionClass]:
+        entries = payload.get("classes") or payload.get("relevant_classes") or []
+        mission = payload.get("mission") or fallback_mission
+        parsed: List[MissionClass] = []
+        seen = set()
+        for entry in entries:
+            if not isinstance(entry, dict):
+                continue
+            name = str(entry.get("name") or "").strip()
+            if not name or name not in YOLO_CLASSES or name in seen:
+                continue
+            seen.add(name)
+            score_raw = entry.get("score")
+            try:
+                score = float(score_raw)
+            except (TypeError, ValueError):
+                score = 0.5
+            rationale = str(entry.get("rationale") or f"Track '{name}' for mission '{mission}'.")
+            parsed.append(MissionClass(name=name, score=max(0.0, min(1.0, score)), rationale=rationale))
+
+        if not parsed:
+            logging.warning("LLM returned no usable classes. Falling back to default YOLO list.")
+            parsed = [
+                MissionClass(
+                    name=label,
+                    score=1.0 - (idx * 0.05),
+                    rationale=f"Fallback selection for mission '{mission}'.",
+                )
+                for idx, label in enumerate(YOLO_CLASSES[: self._top_k])
+            ]
+        return parsed
+
+
+_REASONER: MissionReasoner | None = None
+
+
+def get_mission_plan(mission: str) -> MissionPlan:
+    global _REASONER
+    if _REASONER is None:
+        _REASONER = MissionReasoner()
+    return _REASONER.plan(mission)
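A sketch of one planner round-trip, assuming a valid OpenAI key is configured; the mission text and the example output in the comments are illustrative only.

from mission_planner import get_mission_plan

plan = get_mission_plan("patrol the parking lot for abandoned luggage")
print(plan.queries())  # e.g. ["suitcase", "backpack", "handbag", "person", ...]
print(plan.to_json())  # {"mission": "...", "classes": [{"name": ..., "score": ..., "rationale": ...}, ...]}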
mission_summarizer.py ADDED
@@ -0,0 +1,89 @@
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any, Dict, List
+
+from mission_planner import MissionPlan
+from utils.openai_client import get_openai_client
+
+SUMMARY_MODEL = "gpt-4o-mini"
+
+
+def _trim_detections(detections: List[Dict[str, Any]], max_boxes: int = 5) -> List[Dict[str, Any]]:
+    if len(detections) <= max_boxes:
+        return detections
+    return detections[:max_boxes]
+
+
+def _build_context_snapshot(records: List[Dict[str, Any]]) -> Dict[str, Any]:
+    if not records:
+        return {}
+    first = records[0]
+    middle = records[len(records) // 2]
+    last = records[-1]
+    return {
+        "first_frame": {
+            "frame_index": first["frame_index"],
+            "detections": _trim_detections(first.get("detections", [])),
+        },
+        "middle_frame": {
+            "frame_index": middle["frame_index"],
+            "detections": _trim_detections(middle.get("detections", [])),
+        },
+        "last_frame": {
+            "frame_index": last["frame_index"],
+            "detections": _trim_detections(last.get("detections", [])),
+        },
+    }
+
+
+def summarize_results(
+    mission_prompt: str,
+    mission_plan: MissionPlan,
+    detection_log: List[Dict[str, Any]],
+) -> str:
+    if not detection_log:
+        return "No detections were produced, so no summary is available."
+
+    context_snapshot = _build_context_snapshot(detection_log)
+    payload = {
+        "mission_prompt": mission_prompt,
+        "mission_plan": mission_plan.to_dict(),
+        "global_context": context_snapshot,
+        "detection_log": [
+            {
+                "frame_index": entry["frame_index"],
+                "detections": _trim_detections(entry.get("detections", []), max_boxes=8),
+            }
+            for entry in detection_log
+        ],
+    }
+
+    system_prompt = (
+        "You are a surveillance analyst. Review structured detections aligned to a mission and summarize actionable "
+        "insights, highlighting objects of interest, temporal trends, and any security concerns. "
+        "Base conclusions solely on the provided data; if nothing is detected, explicitly state that."
+    )
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {
+            "role": "user",
+            "content": (
+                "Use this JSON to summarize the mission outcome:\n"
+                f"{json.dumps(payload, ensure_ascii=False)}"
+            ),
+        },
+    ]
+
+    try:
+        client = get_openai_client()
+        completion = client.chat.completions.create(
+            model=SUMMARY_MODEL,
+            temperature=0.2,
+            messages=messages,
+        )
+        return (completion.choices[0].message.content or "").strip()
+    except Exception:
+        logging.exception("Failed to generate mission summary.")
+        return "Mission summary generation failed."
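The context snapshot keeps only the first, middle, and last frame records, each trimmed to five boxes. A quick check with synthetic records (no OpenAI call involved):

from mission_summarizer import _build_context_snapshot

records = [{"frame_index": i, "detections": []} for i in range(10)]
snapshot = _build_context_snapshot(records)
print(sorted(snapshot))                         # ['first_frame', 'last_frame', 'middle_frame']
print(snapshot["middle_frame"]["frame_index"])  # 5, i.e. len(records) // 2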
models/detectors/base.py ADDED
@@ -0,0 +1,18 @@
+from typing import NamedTuple, Sequence
+
+import numpy as np
+
+
+class DetectionResult(NamedTuple):
+    boxes: np.ndarray
+    scores: Sequence[float]
+    labels: Sequence[int]
+
+
+class ObjectDetector:
+    """Detector interface to keep inference agnostic to model details."""
+
+    name: str
+
+    def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
+        raise NotImplementedError
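Any backend that returns a `DetectionResult` can slot into the registry. A do-nothing stub (hypothetical, not part of this commit) illustrates the contract:

from typing import Sequence

import numpy as np

from models.detectors.base import DetectionResult, ObjectDetector


class NullDetector(ObjectDetector):
    """Hypothetical stand-in that reports no detections; handy for wiring tests."""

    name = "null"

    def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
        # An empty (0, 4) box array keeps downstream draw_boxes and
        # detection-record building working without special cases.
        return DetectionResult(boxes=np.empty((0, 4)), scores=[], labels=[])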
models/detectors/owlv2.py ADDED
@@ -0,0 +1,56 @@
+import logging
+from typing import Sequence
+
+import numpy as np
+import torch
+from transformers import Owlv2ForObjectDetection, Owlv2Processor
+
+from models.detectors.base import DetectionResult, ObjectDetector
+
+
+class Owlv2Detector(ObjectDetector):
+    MODEL_NAME = "google/owlv2-large-patch14"
+
+    def __init__(self) -> None:
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        logging.info("Loading %s onto %s", self.MODEL_NAME, self.device)
+        self.processor = Owlv2Processor.from_pretrained(self.MODEL_NAME)
+        torch_dtype = torch.float16 if self.device.type == "cuda" else torch.float32
+        self.model = Owlv2ForObjectDetection.from_pretrained(
+            self.MODEL_NAME, torch_dtype=torch_dtype
+        )
+        self.model.to(self.device)
+        self.model.eval()
+        self.name = "owlv2"
+
+    def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
+        inputs = self.processor(text=queries, images=frame, return_tensors="pt")
+        if hasattr(inputs, "to"):
+            inputs = inputs.to(self.device)
+        else:
+            inputs = {
+                key: value.to(self.device) if hasattr(value, "to") else value
+                for key, value in inputs.items()
+            }
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+        processed = self.processor.post_process_object_detection(
+            outputs, threshold=0.3, target_sizes=[frame.shape[:2]]
+        )[0]
+        boxes = processed["boxes"]
+        scores = processed.get("scores", [])
+        labels = processed.get("labels", [])
+        boxes_np = boxes.cpu().numpy() if hasattr(boxes, "cpu") else np.asarray(boxes)
+        if hasattr(scores, "cpu"):
+            scores_seq = scores.cpu().numpy().tolist()
+        elif isinstance(scores, np.ndarray):
+            scores_seq = scores.tolist()
+        else:
+            scores_seq = list(scores)
+        if hasattr(labels, "cpu"):
+            labels_seq = labels.cpu().numpy().tolist()
+        elif isinstance(labels, np.ndarray):
+            labels_seq = labels.tolist()
+        else:
+            labels_seq = list(labels)
+        return DetectionResult(boxes=boxes_np, scores=scores_seq, labels=labels_seq)
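A smoke-test sketch; the blank frame is synthetic, and instantiating the detector downloads the google/owlv2-large-patch14 checkpoint on first run.

import numpy as np

from models.detectors.owlv2 import Owlv2Detector

detector = Owlv2Detector()

# Synthetic black frame; real callers pass decoded (H, W, 3) video frames.
frame = np.zeros((480, 640, 3), dtype=np.uint8)
result = detector.predict(frame, ["person", "backpack"])
print(result.boxes.shape)  # (N, 4) boxes above the 0.3 score threshold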
models/model_loader.py CHANGED
@@ -1,20 +1,37 @@
-import …
-from …
-
-import …
-from …
-
-_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-…
-_MODEL = Owlv2ForObjectDetection.from_pretrained(MODEL_NAME, torch_dtype=torch_dtype)
-_MODEL.to(_DEVICE)
-_MODEL.eval()
-
-
-def …
+import os
+from functools import lru_cache
+from typing import Callable, Dict, Optional
+
+from models.detectors.base import ObjectDetector
+from models.detectors.owlv2 import Owlv2Detector
+
+DEFAULT_DETECTOR = "owlv2"
+
+_REGISTRY: Dict[str, Callable[[], ObjectDetector]] = {
+    "owlv2": Owlv2Detector,
+}
+
+
+def _create_detector(name: str) -> ObjectDetector:
+    try:
+        factory = _REGISTRY[name]
+    except KeyError as exc:
+        available = ", ".join(sorted(_REGISTRY))
+        raise ValueError(f"Unknown detector '{name}'. Available: {available}") from exc
+    return factory()
+
+
+@lru_cache(maxsize=None)
+def _get_cached_detector(name: str) -> ObjectDetector:
+    return _create_detector(name)
+
+
+def load_detector(name: Optional[str] = None) -> ObjectDetector:
+    """Return a cached detector instance selected via arg or OBJECT_DETECTOR env."""
+    detector_name = name or os.getenv("OBJECT_DETECTOR", DEFAULT_DETECTOR)
+    return _get_cached_detector(detector_name)
+
+
+# Backwards compatibility for existing callers.
+def load_model():
+    return load_detector()
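Selection works either explicitly or via the environment; both paths hit the same `lru_cache`, so repeated calls reuse one model instance (note this instantiates OWLv2, so weights download on first use):

import os

from models.model_loader import load_detector

os.environ["OBJECT_DETECTOR"] = "owlv2"  # the only registered backend so far
a = load_detector()          # resolved via the env var
b = load_detector("owlv2")   # explicit name
assert a is b                # same cached instance either way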
requirements.txt CHANGED
@@ -7,3 +7,4 @@ python-multipart
 accelerate
 pillow
 scipy
+openai
utils/openai_client.py ADDED
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from typing import Dict
+
+from openai import OpenAI
+
+ENV_FILE_NAME = ".env"
+_ENV_KEY_CANDIDATES = (
+    "OPENAI_API_KEY",
+    "OpenAI_API_KEY",
+    "OpenAI-API",
+    "OpenAI_API",
+    "OPENAIKEY",
+)
+
+_OPENAI_CLIENT: OpenAI | None = None
+
+
+def _read_env_file(path: Path) -> Dict[str, str]:
+    entries: Dict[str, str] = {}
+    if not path.exists():
+        return entries
+    for raw_line in path.read_text().splitlines():
+        line = raw_line.strip()
+        if not line or line.startswith("#"):
+            continue
+        if "=" in line:
+            key, value = line.split("=", 1)
+        elif ":" in line:
+            key, value = line.split(":", 1)
+        else:
+            continue
+        entries[key.strip()] = value.strip().strip('"').strip("'")
+    return entries
+
+
+def ensure_openai_api_key() -> str:
+    key = os.getenv("OPENAI_API_KEY")
+    if key:
+        return key
+
+    env_path = Path(__file__).resolve().parent.parent / ENV_FILE_NAME
+    env_entries = _read_env_file(env_path)
+    for candidate in _ENV_KEY_CANDIDATES:
+        if env_entries.get(candidate):
+            key = env_entries[candidate]
+            break
+    else:
+        key = None
+
+    if not key:
+        raise RuntimeError(
+            "OpenAI API key is not configured. Set OPENAI_API_KEY or add it to .env (e.g., 'OpenAI-API: sk-...')."
+        )
+
+    os.environ["OPENAI_API_KEY"] = key
+    return key
+
+
+def get_openai_client() -> OpenAI:
+    global _OPENAI_CLIENT
+    if _OPENAI_CLIENT is None:
+        api_key = ensure_openai_api_key()
+        _OPENAI_CLIENT = OpenAI(api_key=api_key)
+    return _OPENAI_CLIENT
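How the two entry points fit together; the `.env` lines in the comments show the formats the parser accepts (key values are placeholders):

from utils.openai_client import ensure_openai_api_key, get_openai_client

# .env lines the loader accepts (either separator; surrounding quotes stripped):
#   OPENAI_API_KEY=sk-...
#   OpenAI-API: sk-...
key = ensure_openai_api_key()  # env var first, then repo-root .env; raises RuntimeError if neither has a key
client = get_openai_client()   # cached OpenAI client singleton reused by planner and summarizer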