zye0616 committed on
Commit 51780f2 · 1 Parent(s): 05e7070

fix: frontend video display

Files changed (4)
  1. README.md +4 -5
  2. app.py +50 -8
  3. demo.html +35 -12
  4. inference.py +15 -5
README.md CHANGED
@@ -12,8 +12,7 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 
 ## Mission-guided detections
 
-1. Send a `POST /process_video` request with fields `video` (file) and `prompt` (mission text). Optionally include `detector` (`owlv2` or `hf_yolov8`) to pick the backend per request; if omitted the server uses its default/`OBJECT_DETECTOR` setting.
-2. The backend feeds the mission text into an OpenAI (`gpt-4o-mini`) reasoning step that scores and ranks every YOLO/COCO class. Place your API key inside `.env` as either `OPENAI_API_KEY=...` or `OpenAI-API: ...`; the server loads it automatically on startup.
-3. The top scored classes become the textual queries for the existing OWLv2 detector so the detections align with the mission.
-4. After object detection finishes, another OpenAI call ingests the detection log plus the first/middle/last frame context and produces a natural-language summary of the mission outcome.
-5. The HTTP response still streams the processed video, and it now embeds the structured mission plan (`x-mission-plan`) and text summary (`x-mission-summary`) in the headers.
+1. Call `POST /process_video` with fields `video` (file), `prompt` (mission text), and optional `detector` (`owlv2` or `hf_yolov8`). The response is an MP4 stream containing the annotated frames.
+2. Call `POST /mission_summary` with the same fields to receive JSON containing the structured mission plan plus the natural-language summary. This second endpoint isolates the OpenAI call, keeping the video response clean.
+3. Under the hood the mission text still feeds into the OpenAI (`gpt-4o-mini`) reasoning step that ranks the YOLO/COCO classes. Place your API key inside `.env` as either `OPENAI_API_KEY=...` or `OpenAI-API: ...`; the server loads it automatically on startup.
+4. The top-scored classes drive OWLv2 or YOLOv8 to align detections with the mission, and the detection log is summarized via another OpenAI call when requested.
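A minimal client sketch of the two-request flow the new README describes. The Space URL is the one hard-coded in `demo.html`; the clip filename and prompt are placeholders, and the form fields (`video`, `prompt`, `detector`) match the README above.

```python
# Sketch: exercise both endpoints with the `requests` library.
# Assumptions: Space URL taken from demo.html; drone_clip.mp4 is a local file.
import requests

BASE_URL = "https://biaslab2025-demo-2025.hf.space"
data = {"prompt": "find stranded hikers", "detector": "owlv2"}

# 1. /process_video streams back the annotated MP4.
with open("drone_clip.mp4", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/process_video",
        files={"video": ("drone_clip.mp4", f, "video/mp4")},
        data=data,
    )
resp.raise_for_status()
with open("processed.mp4", "wb") as out:
    out.write(resp.content)

# 2. /mission_summary returns the plan and summary as JSON.
with open("drone_clip.mp4", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/mission_summary",
        files={"video": ("drone_clip.mp4", f, "video/mp4")},
        data=data,
    )
resp.raise_for_status()
result = resp.json()
print(result["mission_plan"])
print(result["mission_summary"])
```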
 
app.py CHANGED
@@ -50,6 +50,13 @@ def _schedule_cleanup(background_tasks: BackgroundTasks, path: str) -> None:
     background_tasks.add_task(_cleanup)
 
 
+def _validate_inputs(video: UploadFile | None, prompt: str | None) -> None:
+    if video is None:
+        raise HTTPException(status_code=400, detail="Video file is required.")
+    if not prompt:
+        raise HTTPException(status_code=400, detail="Prompt is required.")
+
+
 @app.post("/process_video")
 async def process_video(
     background_tasks: BackgroundTasks,
@@ -57,10 +64,7 @@ async def process_video(
     prompt: str = Form(...),
     detector: Optional[str] = Form(None),
 ):
-    if video is None:
-        raise HTTPException(status_code=400, detail="Video file is required.")
-    if not prompt:
-        raise HTTPException(status_code=400, detail="Prompt is required.")
+    _validate_inputs(video, prompt)
 
     try:
         input_path = _save_upload_to_tmp(video)
@@ -74,12 +78,13 @@ async def process_video(
     os.close(fd)
 
     try:
-        output_path, mission_plan, mission_summary = run_inference(
+        output_path, _, _ = run_inference(
             input_path,
             output_path,
             prompt,
             max_frames=10,
             detector_name=detector,
+            generate_summary=False,
         )
     except ValueError as exc:
         logging.exception("Video decoding failed.")
@@ -100,11 +105,49 @@ async def process_video(
         media_type="video/mp4",
         filename="processed.mp4",
     )
-    response.headers["x-mission-plan"] = mission_plan.to_json()
-    response.headers["x-mission-summary"] = mission_summary.replace("\n", " ").strip()
     return response
 
 
+@app.post("/mission_summary")
+async def mission_summary(
+    video: UploadFile = File(...),
+    prompt: str = Form(...),
+    detector: Optional[str] = Form(None),
+):
+    _validate_inputs(video, prompt)
+    try:
+        input_path = _save_upload_to_tmp(video)
+    except Exception:
+        logging.exception("Failed to save uploaded file.")
+        raise HTTPException(status_code=500, detail="Failed to save uploaded video.")
+    finally:
+        await video.close()
+
+    try:
+        _, mission_plan, mission_summary = run_inference(
+            input_path,
+            output_video_path=None,
+            mission_prompt=prompt,
+            max_frames=10,
+            detector_name=detector,
+            write_output_video=False,
+            generate_summary=True,
+        )
+    except ValueError as exc:
+        logging.exception("Video decoding failed.")
+        _safe_delete(input_path)
+        raise HTTPException(status_code=500, detail=str(exc))
+    except Exception as exc:
+        logging.exception("Summary generation failed.")
+        _safe_delete(input_path)
+        return JSONResponse(status_code=500, content={"error": str(exc)})
+
+    _safe_delete(input_path)
+    return {
+        "mission_plan": mission_plan.to_dict(),
+        "mission_summary": mission_summary or "",
+    }
+
 if __name__ == "__main__":
     uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
 @app.get("/", response_class=HTMLResponse)
@@ -114,4 +157,3 @@ async def demo_page() -> str:
         return demo_path.read_text(encoding="utf-8")
     except FileNotFoundError:
         return "<h1>Demo page missing</h1>"
-
 
demo.html CHANGED
@@ -140,7 +140,9 @@ button:hover {
 </div>
 
 <script>
-const PROCESS_VIDEO_URL = "https://biaslab2025-demo-2025.hf.space/process_video";
+const API_BASE_URL = "https://biaslab2025-demo-2025.hf.space";
+const PROCESS_VIDEO_URL = `${API_BASE_URL}/process_video`;
+const SUMMARY_URL = `${API_BASE_URL}/mission_summary`;
 
 async function executeMission() {
 
@@ -155,18 +157,18 @@ async function executeMission() {
     return;
   }
 
-  const formData = new FormData();
-  formData.append("video", videoFile);
-  formData.append("prompt", mission);
-  formData.append("detector", detector);
-
-  statusEl.textContent = "Dispatching mission to backend...";
-  summaryEl.textContent = "(Processing...)";
+  statusEl.textContent = "Processing video...";
+  summaryEl.textContent = "(Awaiting summary...)";
 
   try {
+    const videoForm = new FormData();
+    videoForm.append("video", videoFile);
+    videoForm.append("prompt", mission);
+    videoForm.append("detector", detector);
+
     const response = await fetch(PROCESS_VIDEO_URL, {
       method: "POST",
-      body: formData
+      body: videoForm
     });
 
     if (!response.ok) {
@@ -180,14 +182,35 @@ async function executeMission() {
       throw new Error(errorDetail);
     }
 
-    const missionSummary = response.headers.get("x-mission-summary") || "No summary returned.";
-    summaryEl.textContent = missionSummary;
-
     const videoBlob = await response.blob();
     const videoUrl = URL.createObjectURL(videoBlob);
     const videoEl = document.getElementById("processedVideo");
     videoEl.src = videoUrl;
     videoEl.load();
+
+    statusEl.textContent = "Generating summary...";
+
+    const summaryForm = new FormData();
+    summaryForm.append("video", videoFile);
+    summaryForm.append("prompt", mission);
+    summaryForm.append("detector", detector);
+
+    const summaryResponse = await fetch(SUMMARY_URL, {
+      method: "POST",
+      body: summaryForm
+    });
+    if (!summaryResponse.ok) {
+      let errorDetail = `Summary failed (${summaryResponse.status})`;
+      try {
+        const errJson = await summaryResponse.json();
+        errorDetail = errJson.error || errorDetail;
+      } catch (_) {}
+      throw new Error(errorDetail);
+    }
+
+    const summaryJson = await summaryResponse.json();
+    const summaryText = summaryJson.mission_summary || "No summary returned.";
+    summaryEl.textContent = summaryText;
     statusEl.textContent = "Mission complete.";
   } catch (err) {
     console.error(err);
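The summary endpoint reports failures as JSON (`{"error": ...}`), but the body is not guaranteed to be JSON, so the frontend falls back to a status-based message. The same defensive parsing for a Python client, as a sketch (`resp` is any response from the calls shown earlier):

```python
# Sketch: mirror the frontend's fallback when the error body isn't JSON.
def summary_error_detail(resp) -> str:
    detail = f"Summary failed ({resp.status_code})"
    try:
        # /mission_summary returns {"error": "..."} on server-side failures.
        detail = resp.json().get("error", detail)
    except ValueError:
        pass  # non-JSON body (e.g., a proxy error page); keep the fallback text
    return detail
```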
inference.py CHANGED
@@ -66,11 +66,13 @@ def infer_frame(
 
 def run_inference(
     input_video_path: str,
-    output_video_path: str,
+    output_video_path: Optional[str],
     mission_prompt: str,
     max_frames: Optional[int] = None,
     detector_name: Optional[str] = None,
-) -> Tuple[str, MissionPlan, str]:
+    write_output_video: bool = True,
+    generate_summary: bool = True,
+) -> Tuple[Optional[str], MissionPlan, Optional[str]]:
     try:
         frames, fps, width, height = extract_frames(input_video_path)
     except ValueError as exc:
@@ -91,6 +93,14 @@
         detection_log.append({"frame_index": idx, "detections": detections})
         processed_frames.append(processed_frame)
 
-    write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
-    mission_summary = summarize_results(mission_prompt, mission_plan, detection_log)
-    return output_video_path, mission_plan, mission_summary
+    if write_output_video:
+        if not output_video_path:
+            raise ValueError("output_video_path is required when write_output_video=True.")
+        write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
+        video_path_result: Optional[str] = output_video_path
+    else:
+        video_path_result = None
+    mission_summary = (
+        summarize_results(mission_prompt, mission_plan, detection_log) if generate_summary else None
+    )
+    return video_path_result, mission_plan, mission_summary
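The two call patterns the new flags enable, matching how the endpoints in app.py invoke `run_inference` (a sketch; the file paths and prompt are placeholders):

```python
# /process_video path: render the annotated MP4, skip the OpenAI summary call.
path, _, _ = run_inference(
    "in.mp4",
    "out.mp4",
    "find stranded hikers",
    max_frames=10,
    generate_summary=False,
)

# /mission_summary path: skip the video write; the first tuple element is None.
_, plan, summary = run_inference(
    "in.mp4",
    output_video_path=None,
    mission_prompt="find stranded hikers",
    max_frames=10,
    write_output_video=False,
    generate_summary=True,
)
```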