Commit message: update video processor function; it now reports the timeline of detected objects and fixes error issues.
- app.py +166 -63
- ui_manager.py +212 -141
- video_processor.py +500 -295
app.py
CHANGED
@@ -8,6 +8,8 @@ import cv2
 from PIL import Image
 import tempfile
 import uuid
+import time
+import traceback
 import spaces
 
 from detection_model import DetectionModel
@@ -27,7 +29,7 @@ ui_manager = None
 def initialize_processors():
     """
     Initialize the image and video processors with LLM support.
-
+
     Returns:
         bool: True if initialization was successful, False otherwise
     """
@@ -49,8 +51,9 @@ def initialize_processors():
         else:
             print("WARNING: scene_analyzer attribute not found in image_processor")
 
-
-
+        # Initialize the standalone VideoProcessor
+        video_processor = VideoProcessor()
+        print("VideoProcessor initialized successfully as independent module")
         return True
 
     except Exception as e:
@@ -62,7 +65,7 @@ def initialize_processors():
     try:
         print("Attempting fallback initialization without LLM...")
         image_processor = ImageProcessor(use_llm=False, enable_places365=False)
-        video_processor = VideoProcessor(
+        video_processor = VideoProcessor()
        print("Fallback processors initialized successfully without LLM and Places365")
        return True
 
@@ -77,25 +80,25 @@ def initialize_processors():
 def initialize_ui_manager():
     """
     Initialize the UI manager and set up references to processors.
-
+
     Returns:
         UIManager: Initialized UI manager instance
     """
     global ui_manager, image_processor
-
+
     ui_manager = UIManager()
-
+
     # Set image processor reference for dynamic class retrieval
     if image_processor:
         ui_manager.set_image_processor(image_processor)
-
+
     return ui_manager
 
 @spaces.GPU(duration=180)
 def handle_image_upload(image, model_name, confidence_threshold, filter_classes=None, use_llm=True, enable_landmark=True):
     """
     Processes a single uploaded image.
-
+
     Args:
         image: PIL Image object
         model_name: Name of the YOLO model to use
@@ -103,10 +106,10 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
         filter_classes: List of class names/IDs to filter
         use_llm: Whether to use LLM for enhanced descriptions
         enable_landmark: Whether to enable landmark detection
-
+
     Returns:
-        Tuple: (result_image, result_text, formatted_stats, plot_figure,
-                scene_description_html, original_desc_html, activities_list_data,
+        Tuple: (result_image, result_text, formatted_stats, plot_figure,
+                scene_description_html, original_desc_html, activities_list_data,
                 safety_data, zones, lighting)
     """
     # Enhanced safety check for image_processor
@@ -140,7 +143,7 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
 
     print(f"DIAGNOSTIC: Image upload handled with enable_landmark={enable_landmark}, use_llm={use_llm}")
     print(f"Processing image with model: {model_name}, confidence: {confidence_threshold}, use_llm: {use_llm}, enable_landmark: {enable_landmark}")
-
+
     try:
         image_processor.use_llm = use_llm
 
@@ -366,7 +369,7 @@ def handle_image_upload(image, model_name, confidence_threshold, filter_classes=
         </div>
         '''
 
-        # The original description is only shown when using the LLM
+        # The original description is shown in the accordion only when the LLM is used and an enhanced description exists
         original_desc_visibility = "block" if use_llm and enhanced_description else "none"
         original_desc_html = f'''
         <div id="original_scene_analysis_accordion" style="display: {original_desc_visibility};">
@@ -483,95 +486,195 @@ def download_video_from_url(video_url, max_duration_minutes=10):
         print(f"Error downloading video: {e}\n{error_details}")
         return None, f"Error downloading video: {str(e)}"
 
+def generate_basic_video_summary(analysis_results: Dict) -> str:
+    """
+    Generate a basic statistical summary for the video.
+
+    Args:
+        analysis_results (Dict): The new analysis results structure
+
+    Returns:
+        str: A detailed statistical summary
+    """
+    summary_lines = ["=== Video Analysis Summary ===", ""]
+
+    # Processing info
+    processing_info = analysis_results.get("processing_info", {})
+    duration = processing_info.get("video_duration_seconds", 0)
+    total_frames = processing_info.get("total_frames", 0)
+    analyzed_frames = processing_info.get("frames_analyzed", 0)
+
+    summary_lines.extend([
+        f"Video Duration: {duration:.1f} seconds ({total_frames} total frames)",
+        f"Frames Analyzed: {analyzed_frames} frames (every {processing_info.get('processing_interval', 1)} frames)",
+        ""
+    ])
+
+    # Detected-object summary
+    object_summary = analysis_results.get("object_summary", {})
+    total_objects = object_summary.get("total_unique_objects_detected", 0)
+    object_types = object_summary.get("object_types_found", 0)
+
+    summary_lines.extend([
+        f"Objects Detected: {total_objects} total objects across {object_types} categories",
+        ""
+    ])
+
+    # Detailed counts
+    detailed_counts = object_summary.get("detailed_counts", {})
+    if detailed_counts:
+        summary_lines.extend([
+            "Object Breakdown:",
+            *[f"  • {count} {name}(s)" for name, count in detailed_counts.items()],
+            ""
+        ])
+
+    # Practical analytics summary
+    practical_analytics = analysis_results.get("practical_analytics", {})
+
+    # Object density analysis
+    density_info = practical_analytics.get("object_density", {})
+    if density_info:
+        objects_per_min = density_info.get("objects_per_minute", 0)
+        peak_periods = density_info.get("peak_activity_periods", [])
+        summary_lines.extend([
+            f"Activity Level: {objects_per_min:.1f} objects per minute",
+            f"Peak Activity Periods: {len(peak_periods)} identified",
+            ""
+        ])
+
+    # Scene appropriateness
+    scene_info = practical_analytics.get("scene_appropriateness", {})
+    if scene_info.get("scene_detected", False):
+        scene_name = scene_info.get("scene_name", "unknown")
+        appropriateness = scene_info.get("appropriateness_score", 0)
+        summary_lines.extend([
+            f"Scene Type: {scene_name}",
+            f"Object-Scene Compatibility: {appropriateness:.1%}",
+            ""
+        ])
+
+    # Quality metrics
+    quality_info = practical_analytics.get("quality_metrics", {})
+    if quality_info:
+        quality_grade = quality_info.get("quality_grade", "unknown")
+        overall_confidence = quality_info.get("overall_confidence", 0)
+        summary_lines.extend([
+            f"Detection Quality: {quality_grade.title()} (avg confidence: {overall_confidence:.3f})",
+            ""
+        ])
+
+    summary_lines.append(f"Processing completed in {processing_info.get('processing_time_seconds', 0):.1f} seconds.")
+
+    return "\n".join(summary_lines)
 
 @spaces.GPU
-def handle_video_upload(video_input, video_url, input_type, model_name,
+def handle_video_upload(video_input, video_url, input_type, model_name,
+                        confidence_threshold, process_interval):
     """
-
+    Handle video upload and processing.
 
     Args:
-        video_input:
-        video_url:
-        input_type:
-        model_name:
-        confidence_threshold:
-        process_interval:
+        video_input: The uploaded video file
+        video_url: The video URL (if URL input is used)
+        input_type: Input type ("upload" or "url")
+        model_name: YOLO model name
+        confidence_threshold: Confidence threshold
+        process_interval: Processing interval (process every N frames)
 
     Returns:
         Tuple: (output_video_path, summary_html, formatted_stats)
     """
-
-
+    if video_processor is None:
+        error_msg = "Error: Video processor not initialized."
+        error_html = f"<div class='video-summary-content-wrapper'><pre>{error_msg}</pre></div>"
+        return None, error_html, {"error": "Video processor not available"}
 
-
+    video_path = None
+
+    # Dispatch on the input type
     if input_type == "upload" and video_input:
-        print(f"Processing uploaded video file")
         video_path = video_input
+        print(f"Processing uploaded video file: {video_path}")
     elif input_type == "url" and video_url:
         print(f"Processing video from URL: {video_url}")
-
-
-
-        error_html
-
-
-
-
-
-
+        video_path, error_msg = download_video_from_url(video_url)
+        if error_msg:
+            error_html = f"<div class='video-summary-content-wrapper'><pre>{error_msg}</pre></div>"
+            return None, error_html, {"error": error_msg}
+
+    if not video_path:
+        error_msg = "Please provide a video file or valid URL."
+        error_html = f"<div class='video-summary-content-wrapper'><pre>{error_msg}</pre></div>"
+        return None, error_html, {"error": "No video input provided"}
+
+    print(f"Starting practical video analysis: model={model_name}, confidence={confidence_threshold}, interval={process_interval}")
+
+    processing_start_time = time.time()
+
     try:
-
-        output_video_path, summary_text, stats_dict = video_processor.process_video_file(
+        output_video_path, analysis_results = video_processor.process_video(
            video_path=video_path,
            model_name=model_name,
            confidence_threshold=confidence_threshold,
-            process_interval=int(process_interval)
+            process_interval=int(process_interval)
        )
-
-
-
-
-
-
-
-
-
-
-
+
+        print(f"Video processing function returned: path={output_video_path}")
+
+        if output_video_path is None:
+            error_msg = analysis_results.get("error", "Unknown error occurred during video processing")
+            error_html = f"<div class='video-summary-content-wrapper'><pre>Processing failed: {error_msg}</pre></div>"
+            return None, error_html, analysis_results
+
+        # Build the summary directly from the statistics
+        basic_summary = generate_basic_video_summary(analysis_results)
+
+        # Final result
+        processing_time = time.time() - processing_start_time
+        processing_info = analysis_results.get("processing_info", {})
+
+        summary_lines = [
+            f"Video processing completed in {processing_time:.2f} seconds.",
+            f"Analyzed {processing_info.get('frames_analyzed', 0)} frames out of {processing_info.get('total_frames', 0)} total frames.",
+            f"Processing interval: every {process_interval} frames",
+            basic_summary
+        ]
+
+        summary_content = '\n'.join(summary_lines)
+        summary_html = f"<div class='video-summary-content-wrapper'><pre>{summary_content}</pre></div>"
+
+        return output_video_path, summary_html, analysis_results
 
    except Exception as e:
        print(f"Error in handle_video_upload: {e}")
-
-        error_msg = f"
+        traceback.print_exc()
+        error_msg = f"影片處理失敗: {str(e)}"
        error_html = f"<div class='video-summary-content-wrapper'><pre>{error_msg}</pre></div>"
        return None, error_html, {"error": str(e)}
 
-
 def main():
-    """
-    Main function to initialize processors and launch the Gradio interface.
-    """
+    """Main function: initialize the processors and launch Gradio."""
    global ui_manager
 
-
+    print("=== VisionScout Application Starting ===")
+
    print("Initializing processors...")
    initialization_success = initialize_processors()
    if not initialization_success:
-        print("
+        print("ERROR: Failed to initialize processors. Application cannot start.")
        return
-
-    # Initialize UI manager
+
    print("Initializing UI manager...")
    ui_manager = initialize_ui_manager()
-
-    # Create and launch the Gradio interface
+
    print("Creating Gradio interface...")
    demo_interface = ui_manager.create_interface(
        handle_image_upload_fn=handle_image_upload,
        handle_video_upload_fn=handle_video_upload,
        download_video_from_url_fn=download_video_from_url
    )
-
+
    print("Launching application...")
    demo_interface.launch(debug=True)
 
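For reference, here is a minimal sketch of the `analysis_results` structure that the new `generate_basic_video_summary()` consumes. The keys mirror exactly what the function above reads (`processing_info`, `object_summary`, `practical_analytics`); the concrete values and the `from app import ...` line are assumptions for illustration only, since real results come from `VideoProcessor.process_video()`.

```python
# Illustrative only: a hand-written analysis_results dict using the keys that
# generate_basic_video_summary() reads in the commit above. The numbers are
# made up; in the app this dict is produced by VideoProcessor.process_video().
from app import generate_basic_video_summary  # assumed import path

sample_results = {
    "processing_info": {
        "video_duration_seconds": 42.0,
        "total_frames": 1260,
        "frames_analyzed": 126,
        "processing_interval": 10,
        "processing_time_seconds": 18.4,
    },
    "object_summary": {
        "total_unique_objects_detected": 9,
        "object_types_found": 3,
        "detailed_counts": {"car": 5, "person": 3, "bus": 1},
    },
    "practical_analytics": {
        "object_density": {
            "objects_per_minute": 12.9,
            "peak_activity_periods": [(5.0, 9.5)],
        },
        "scene_appropriateness": {
            "scene_detected": True,
            "scene_name": "city street",
            "appropriateness_score": 0.87,
        },
        "quality_metrics": {
            "quality_grade": "good",
            "overall_confidence": 0.612,
        },
    },
}

# Prints the "=== Video Analysis Summary ===" report built by the new function.
print(generate_basic_video_summary(sample_results))
```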
ui_manager.py
CHANGED
|
@@ -7,17 +7,17 @@ from style import Style
|
|
| 7 |
|
| 8 |
class UIManager:
|
| 9 |
"""
|
| 10 |
-
Manages all UI-related functionality
|
| 11 |
Handles Gradio interface creation, component definitions, and event binding.
|
| 12 |
"""
|
| 13 |
-
|
| 14 |
def __init__(self):
|
| 15 |
"""Initialize the UI Manager."""
|
| 16 |
self.available_models = None
|
| 17 |
self.model_choices = []
|
| 18 |
self.class_choices_formatted = []
|
| 19 |
self._setup_model_choices()
|
| 20 |
-
|
| 21 |
def _setup_model_choices(self):
|
| 22 |
"""Setup model choices for dropdowns."""
|
| 23 |
try:
|
|
@@ -26,14 +26,14 @@ class UIManager:
|
|
| 26 |
except ImportError:
|
| 27 |
# Fallback model choices if DetectionModel is not available
|
| 28 |
self.model_choices = ["yolov8n.pt", "yolov8s.pt", "yolov8m.pt", "yolov8l.pt", "yolov8x.pt"]
|
| 29 |
-
|
| 30 |
# Setup class choices
|
| 31 |
self.class_choices_formatted = [f"{id}: {name}" for id, name in self.get_all_classes()]
|
| 32 |
-
|
| 33 |
def get_all_classes(self):
|
| 34 |
"""
|
| 35 |
Gets all available COCO classes.
|
| 36 |
-
|
| 37 |
Returns:
|
| 38 |
List[Tuple[int, str]]: List of (class_id, class_name) tuples
|
| 39 |
"""
|
|
@@ -52,7 +52,7 @@ class UIManager:
|
|
| 52 |
except Exception:
|
| 53 |
pass
|
| 54 |
|
| 55 |
-
#
|
| 56 |
default_classes = {
|
| 57 |
0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
|
| 58 |
6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
|
|
@@ -72,27 +72,27 @@ class UIManager:
|
|
| 72 |
77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
|
| 73 |
}
|
| 74 |
return sorted(default_classes.items())
|
| 75 |
-
|
| 76 |
def set_image_processor(self, image_processor):
|
| 77 |
"""
|
| 78 |
Set the image processor reference for dynamic class retrieval.
|
| 79 |
-
|
| 80 |
Args:
|
| 81 |
image_processor: The ImageProcessor instance
|
| 82 |
"""
|
| 83 |
self._image_processor = image_processor
|
| 84 |
-
|
| 85 |
def get_css_styles(self):
|
| 86 |
"""
|
| 87 |
Get CSS styles for the interface.
|
| 88 |
-
|
| 89 |
Returns:
|
| 90 |
str: CSS styles
|
| 91 |
"""
|
| 92 |
try:
|
| 93 |
return Style.get_css()
|
| 94 |
except ImportError:
|
| 95 |
-
#
|
| 96 |
return """
|
| 97 |
.app-header {
|
| 98 |
text-align: center;
|
|
@@ -111,15 +111,23 @@ class UIManager:
|
|
| 111 |
border: none !important;
|
| 112 |
border-radius: 8px !important;
|
| 113 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
"""
|
| 115 |
-
|
| 116 |
def get_model_description(self, model_name):
|
| 117 |
"""
|
| 118 |
Get model description for the given model name.
|
| 119 |
-
|
| 120 |
Args:
|
| 121 |
model_name: Name of the model
|
| 122 |
-
|
| 123 |
Returns:
|
| 124 |
str: Model description
|
| 125 |
"""
|
|
@@ -127,11 +135,11 @@ class UIManager:
|
|
| 127 |
return DetectionModel.get_model_description(model_name)
|
| 128 |
except ImportError:
|
| 129 |
return f"Model: {model_name}"
|
| 130 |
-
|
| 131 |
def create_header(self):
|
| 132 |
"""
|
| 133 |
Create the application header.
|
| 134 |
-
|
| 135 |
Returns:
|
| 136 |
gr.HTML: Header HTML component
|
| 137 |
"""
|
|
@@ -142,7 +150,7 @@ class UIManager:
|
|
| 142 |
<div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;"><div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div></div>
|
| 143 |
<div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
|
| 144 |
<div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🖼️</span> Image Analysis</div>
|
| 145 |
-
<div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🎬</span> Video Analysis</div>
|
| 146 |
</div>
|
| 147 |
<div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
|
| 148 |
<p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
|
|
@@ -152,18 +160,18 @@ class UIManager:
|
|
| 152 |
</div>
|
| 153 |
</div>
|
| 154 |
""")
|
| 155 |
-
|
| 156 |
def create_footer(self):
|
| 157 |
"""
|
| 158 |
Create the application footer.
|
| 159 |
-
|
| 160 |
Returns:
|
| 161 |
gr.HTML: Footer HTML component
|
| 162 |
"""
|
| 163 |
return gr.HTML("""
|
| 164 |
<div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
|
| 165 |
<div style="margin-bottom: 15px;">
|
| 166 |
-
<p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Places365, Meta Llama3.2 and Ultralytics • Created with Gradio</p>
|
| 167 |
</div>
|
| 168 |
<div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
|
| 169 |
<p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
|
|
@@ -173,27 +181,27 @@ class UIManager:
|
|
| 173 |
</div>
|
| 174 |
</div>
|
| 175 |
""")
|
| 176 |
-
|
| 177 |
def create_image_tab(self):
|
| 178 |
"""
|
| 179 |
Create the image processing tab with all components.
|
| 180 |
-
|
| 181 |
Returns:
|
| 182 |
Dict: Dictionary containing all image tab components
|
| 183 |
"""
|
| 184 |
components = {}
|
| 185 |
-
|
| 186 |
with gr.Tab("Image Processing"):
|
| 187 |
components['current_image_model'] = gr.State("yolov8m.pt")
|
| 188 |
-
|
| 189 |
with gr.Row(equal_height=False):
|
| 190 |
# Left Column: Image Input & Controls
|
| 191 |
with gr.Column(scale=4, elem_classes="input-panel"):
|
| 192 |
with gr.Group():
|
| 193 |
gr.HTML('<div class="section-heading">Upload Image</div>')
|
| 194 |
components['image_input'] = gr.Image(
|
| 195 |
-
type="pil",
|
| 196 |
-
label="Upload an image",
|
| 197 |
elem_classes="upload-box"
|
| 198 |
)
|
| 199 |
|
|
@@ -204,7 +212,7 @@ class UIManager:
|
|
| 204 |
label="Select Model",
|
| 205 |
info="Choose speed vs. accuracy (n=fast, m=balanced, x=accurate)"
|
| 206 |
)
|
| 207 |
-
|
| 208 |
components['image_model_info'] = gr.Markdown(
|
| 209 |
self.get_model_description("yolov8m.pt")
|
| 210 |
)
|
|
@@ -234,7 +242,7 @@ class UIManager:
|
|
| 234 |
components['vehicles_btn'] = gr.Button("Vehicles", size="sm")
|
| 235 |
components['animals_btn'] = gr.Button("Animals", size="sm")
|
| 236 |
components['objects_btn'] = gr.Button("Common Objects", size="sm")
|
| 237 |
-
|
| 238 |
components['image_class_filter'] = gr.Dropdown(
|
| 239 |
choices=self.class_choices_formatted,
|
| 240 |
multiselect=True,
|
|
@@ -243,8 +251,8 @@ class UIManager:
|
|
| 243 |
)
|
| 244 |
|
| 245 |
components['image_detect_btn'] = gr.Button(
|
| 246 |
-
"Analyze Image",
|
| 247 |
-
variant="primary",
|
| 248 |
elem_classes="detect-btn"
|
| 249 |
)
|
| 250 |
|
|
@@ -289,21 +297,21 @@ class UIManager:
|
|
| 289 |
# Detection Result Tab
|
| 290 |
with gr.Tab("Detection Result"):
|
| 291 |
components['image_result_image'] = gr.Image(
|
| 292 |
-
type="pil",
|
| 293 |
label="Detection Result"
|
| 294 |
)
|
| 295 |
gr.HTML('<div class="section-heading">Detection Details</div>')
|
| 296 |
components['image_result_text'] = gr.Textbox(
|
| 297 |
-
label=None,
|
| 298 |
-
lines=10,
|
| 299 |
-
elem_id="detection-details",
|
| 300 |
container=False
|
| 301 |
)
|
| 302 |
|
| 303 |
# Scene Understanding Tab
|
| 304 |
with gr.Tab("Scene Understanding"):
|
| 305 |
gr.HTML('<div class="section-heading">Scene Analysis</div>')
|
| 306 |
-
|
| 307 |
# Info details
|
| 308 |
gr.HTML("""
|
| 309 |
<details class="info-details" style="margin: 5px 0 15px 0;">
|
|
@@ -327,16 +335,16 @@ class UIManager:
|
|
| 327 |
</p>
|
| 328 |
</div>
|
| 329 |
''')
|
| 330 |
-
|
| 331 |
components['image_scene_description_html'] = gr.HTML(
|
| 332 |
-
label=None,
|
| 333 |
elem_id="scene_analysis_description_text"
|
| 334 |
)
|
| 335 |
|
| 336 |
# Original Scene Analysis accordion
|
| 337 |
with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
|
| 338 |
components['image_llm_description'] = gr.HTML(
|
| 339 |
-
label=None,
|
| 340 |
elem_id="original_scene_description_text"
|
| 341 |
)
|
| 342 |
|
|
@@ -344,32 +352,32 @@ class UIManager:
|
|
| 344 |
with gr.Column(scale=1):
|
| 345 |
gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Possible Activities</div>')
|
| 346 |
components['image_activities_list'] = gr.Dataframe(
|
| 347 |
-
headers=["Activity"],
|
| 348 |
-
datatype=["str"],
|
| 349 |
-
row_count=5,
|
| 350 |
-
col_count=1,
|
| 351 |
wrap=True
|
| 352 |
)
|
| 353 |
|
| 354 |
with gr.Column(scale=1):
|
| 355 |
gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Safety Concerns</div>')
|
| 356 |
components['image_safety_list'] = gr.Dataframe(
|
| 357 |
-
headers=["Concern"],
|
| 358 |
-
datatype=["str"],
|
| 359 |
-
row_count=5,
|
| 360 |
-
col_count=1,
|
| 361 |
wrap=True
|
| 362 |
)
|
| 363 |
|
| 364 |
gr.HTML('<div class="section-heading">Functional Zones</div>')
|
| 365 |
components['image_zones_json'] = gr.JSON(
|
| 366 |
-
label=None,
|
| 367 |
elem_classes="json-box"
|
| 368 |
)
|
| 369 |
|
| 370 |
gr.HTML('<div class="section-heading">Lighting Conditions</div>')
|
| 371 |
components['image_lighting_info'] = gr.JSON(
|
| 372 |
-
label=None,
|
| 373 |
elem_classes="json-box"
|
| 374 |
)
|
| 375 |
|
|
@@ -379,27 +387,28 @@ class UIManager:
|
|
| 379 |
with gr.Column(scale=3, elem_classes="plot-column"):
|
| 380 |
gr.HTML('<div class="section-heading">Object Distribution</div>')
|
| 381 |
components['image_plot_output'] = gr.Plot(
|
| 382 |
-
label=None,
|
| 383 |
elem_classes="large-plot-container"
|
| 384 |
)
|
| 385 |
with gr.Column(scale=2, elem_classes="stats-column"):
|
| 386 |
gr.HTML('<div class="section-heading">Detection Statistics</div>')
|
| 387 |
components['image_stats_json'] = gr.JSON(
|
| 388 |
-
label=None,
|
| 389 |
elem_classes="enhanced-json-display"
|
| 390 |
)
|
| 391 |
-
|
| 392 |
return components
|
| 393 |
|
| 394 |
def create_video_tab(self):
|
| 395 |
"""
|
| 396 |
Create the video processing tab with all components.
|
| 397 |
-
|
|
|
|
| 398 |
Returns:
|
| 399 |
Dict: Dictionary containing all video tab components
|
| 400 |
"""
|
| 401 |
components = {}
|
| 402 |
-
|
| 403 |
with gr.Tab("Video Processing"):
|
| 404 |
with gr.Row(equal_height=False):
|
| 405 |
# Left Column: Video Input & Controls
|
|
@@ -444,21 +453,35 @@ class UIManager:
|
|
| 444 |
choices=self.model_choices,
|
| 445 |
value="yolov8n.pt",
|
| 446 |
label="Select Model (Video)",
|
| 447 |
-
info="Faster models (like 'n') are recommended"
|
| 448 |
)
|
| 449 |
components['video_confidence'] = gr.Slider(
|
| 450 |
minimum=0.1, maximum=0.9, value=0.4, step=0.05,
|
| 451 |
-
label="Confidence Threshold (Video)"
|
|
|
|
| 452 |
)
|
| 453 |
components['video_process_interval'] = gr.Slider(
|
| 454 |
minimum=1, maximum=60, value=10, step=1,
|
| 455 |
label="Processing Interval (Frames)",
|
| 456 |
-
info="Analyze every Nth frame (higher value = faster)"
|
| 457 |
)
|
| 458 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
components['video_process_btn'] = gr.Button(
|
| 460 |
-
"
|
| 461 |
-
variant="primary",
|
| 462 |
elem_classes="detect-btn"
|
| 463 |
)
|
| 464 |
|
|
@@ -467,9 +490,17 @@ class UIManager:
|
|
| 467 |
gr.HTML('<div class="section-heading">How to Use (Video)</div>')
|
| 468 |
gr.Markdown("""
|
| 469 |
1. Choose your input method: Upload a file or enter a URL.
|
| 470 |
-
2. Adjust settings if needed
|
| 471 |
-
|
| 472 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
""")
|
| 474 |
|
| 475 |
# Video examples
|
|
@@ -477,8 +508,9 @@ class UIManager:
|
|
| 477 |
gr.HTML("""
|
| 478 |
<div style="padding: 10px; background-color: #f0f7ff; border-radius: 6px; margin-bottom: 15px;">
|
| 479 |
<p style="font-size: 14px; color: #4A5568; margin: 0;">
|
| 480 |
-
Upload any video containing objects that YOLO can detect. For testing, find sample videos
|
| 481 |
-
<a href="https://www.pexels.com/search/videos/street/" target="_blank" style="color: #3182ce; text-decoration: underline;">
|
|
|
|
| 482 |
</p>
|
| 483 |
</div>
|
| 484 |
""")
|
|
@@ -486,48 +518,87 @@ class UIManager:
|
|
| 486 |
# Right Column: Video Results
|
| 487 |
with gr.Column(scale=6, elem_classes="output-panel video-result-panel"):
|
| 488 |
gr.HTML("""
|
| 489 |
-
<div class="section-heading">Video
|
| 490 |
<details class="info-details" style="margin: 5px 0 15px 0;">
|
| 491 |
<summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
|
| 492 |
-
🎬 Video
|
| 493 |
</summary>
|
| 494 |
<div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
|
| 495 |
<p style="font-size: 13px; color: #718096; margin: 0;">
|
| 496 |
-
|
| 497 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 498 |
</p>
|
| 499 |
</div>
|
| 500 |
</details>
|
| 501 |
""")
|
| 502 |
-
|
| 503 |
components['video_output'] = gr.Video(
|
| 504 |
-
label="
|
| 505 |
elem_classes="video-output-container"
|
| 506 |
)
|
| 507 |
|
| 508 |
-
gr.
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 513 |
|
| 514 |
-
gr.HTML('<div class="section-heading">Aggregated Statistics</div>')
|
| 515 |
-
components['video_stats_json'] = gr.JSON(
|
| 516 |
-
label=None,
|
| 517 |
-
elem_classes="video-stats-display"
|
| 518 |
-
)
|
| 519 |
-
|
| 520 |
return components
|
| 521 |
-
|
| 522 |
def get_filter_button_mappings(self):
|
| 523 |
"""
|
| 524 |
Get the class ID mappings for filter buttons.
|
| 525 |
-
|
| 526 |
Returns:
|
| 527 |
Dict: Dictionary containing class ID lists for different categories
|
| 528 |
"""
|
| 529 |
available_classes_list = self.get_all_classes()
|
| 530 |
-
|
| 531 |
return {
|
| 532 |
'people_classes_ids': [0],
|
| 533 |
'vehicles_classes_ids': [1, 2, 3, 4, 5, 6, 7, 8],
|
|
@@ -535,36 +606,36 @@ class UIManager:
|
|
| 535 |
'common_objects_ids': [39, 41, 42, 43, 44, 45, 56, 57, 60, 62, 63, 67, 73],
|
| 536 |
'available_classes_list': available_classes_list
|
| 537 |
}
|
| 538 |
-
|
| 539 |
-
def create_interface(self,
|
| 540 |
-
handle_image_upload_fn,
|
| 541 |
-
handle_video_upload_fn,
|
| 542 |
download_video_from_url_fn):
|
| 543 |
"""
|
| 544 |
Create the complete Gradio interface.
|
| 545 |
-
|
| 546 |
Args:
|
| 547 |
handle_image_upload_fn: Function to handle image upload
|
| 548 |
handle_video_upload_fn: Function to handle video upload
|
| 549 |
download_video_from_url_fn: Function to download video from URL
|
| 550 |
-
|
| 551 |
Returns:
|
| 552 |
gr.Blocks: Complete Gradio interface
|
| 553 |
"""
|
| 554 |
css = self.get_css_styles()
|
| 555 |
-
|
| 556 |
with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
|
| 557 |
-
|
| 558 |
# Header
|
| 559 |
with gr.Group(elem_classes="app-header"):
|
| 560 |
self.create_header()
|
| 561 |
|
| 562 |
# Main Content with Tabs
|
| 563 |
with gr.Tabs(elem_classes="tabs"):
|
| 564 |
-
|
| 565 |
# Image Processing Tab
|
| 566 |
image_components = self.create_image_tab()
|
| 567 |
-
|
| 568 |
# Video Processing Tab
|
| 569 |
video_components = self.create_video_tab()
|
| 570 |
|
|
@@ -573,22 +644,22 @@ class UIManager:
|
|
| 573 |
|
| 574 |
# Setup Event Listeners
|
| 575 |
self._setup_event_listeners(
|
| 576 |
-
image_components,
|
| 577 |
-
video_components,
|
| 578 |
-
handle_image_upload_fn,
|
| 579 |
handle_video_upload_fn
|
| 580 |
)
|
| 581 |
|
| 582 |
return demo
|
| 583 |
-
|
| 584 |
-
def _setup_event_listeners(self,
|
| 585 |
-
image_components,
|
| 586 |
-
video_components,
|
| 587 |
-
handle_image_upload_fn,
|
| 588 |
handle_video_upload_fn):
|
| 589 |
"""
|
| 590 |
Setup all event listeners for the interface.
|
| 591 |
-
|
| 592 |
Args:
|
| 593 |
image_components: Dictionary of image tab components
|
| 594 |
video_components: Dictionary of video tab components
|
|
@@ -611,73 +682,73 @@ class UIManager:
|
|
| 611 |
common_objects_ids = filter_mappings['common_objects_ids']
|
| 612 |
|
| 613 |
image_components['people_btn'].click(
|
| 614 |
-
lambda: [f"{id}: {name}" for id, name in available_classes_list if id in people_classes_ids],
|
| 615 |
outputs=image_components['image_class_filter']
|
| 616 |
)
|
| 617 |
image_components['vehicles_btn'].click(
|
| 618 |
-
lambda: [f"{id}: {name}" for id, name in available_classes_list if id in vehicles_classes_ids],
|
| 619 |
outputs=image_components['image_class_filter']
|
| 620 |
)
|
| 621 |
image_components['animals_btn'].click(
|
| 622 |
-
lambda: [f"{id}: {name}" for id, name in available_classes_list if id in animals_classes_ids],
|
| 623 |
outputs=image_components['image_class_filter']
|
| 624 |
)
|
| 625 |
image_components['objects_btn'].click(
|
| 626 |
-
lambda: [f"{id}: {name}" for id, name in available_classes_list if id in common_objects_ids],
|
| 627 |
outputs=image_components['image_class_filter']
|
| 628 |
)
|
| 629 |
|
| 630 |
# Video Input Type Change Handler
|
| 631 |
video_components['video_input_type'].change(
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
)
|
| 641 |
|
| 642 |
# Image Detect Button Click Handler
|
| 643 |
image_components['image_detect_btn'].click(
|
| 644 |
fn=handle_image_upload_fn,
|
| 645 |
inputs=[
|
| 646 |
-
image_components['image_input'],
|
| 647 |
-
image_components['image_model_dropdown'],
|
| 648 |
-
image_components['image_confidence'],
|
| 649 |
-
image_components['image_class_filter'],
|
| 650 |
-
image_components['use_llm'],
|
| 651 |
image_components['use_landmark_detection']
|
| 652 |
],
|
| 653 |
outputs=[
|
| 654 |
-
image_components['image_result_image'],
|
| 655 |
-
image_components['image_result_text'],
|
| 656 |
-
image_components['image_stats_json'],
|
| 657 |
image_components['image_plot_output'],
|
| 658 |
-
image_components['image_scene_description_html'],
|
| 659 |
-
image_components['image_llm_description'],
|
| 660 |
-
image_components['image_activities_list'],
|
| 661 |
-
image_components['image_safety_list'],
|
| 662 |
image_components['image_zones_json'],
|
| 663 |
image_components['image_lighting_info']
|
| 664 |
]
|
| 665 |
)
|
| 666 |
|
| 667 |
-
# Video Process Button Click Handler
|
| 668 |
video_components['video_process_btn'].click(
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
],
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
]
|
| 683 |
)
|
|
|
|
| 7 |
|
| 8 |
class UIManager:
|
| 9 |
"""
|
| 10 |
+
Manages all UI-related functionality
|
| 11 |
Handles Gradio interface creation, component definitions, and event binding.
|
| 12 |
"""
|
| 13 |
+
|
| 14 |
def __init__(self):
|
| 15 |
"""Initialize the UI Manager."""
|
| 16 |
self.available_models = None
|
| 17 |
self.model_choices = []
|
| 18 |
self.class_choices_formatted = []
|
| 19 |
self._setup_model_choices()
|
| 20 |
+
|
| 21 |
def _setup_model_choices(self):
|
| 22 |
"""Setup model choices for dropdowns."""
|
| 23 |
try:
|
|
|
|
| 26 |
except ImportError:
|
| 27 |
# Fallback model choices if DetectionModel is not available
|
| 28 |
self.model_choices = ["yolov8n.pt", "yolov8s.pt", "yolov8m.pt", "yolov8l.pt", "yolov8x.pt"]
|
| 29 |
+
|
| 30 |
# Setup class choices
|
| 31 |
self.class_choices_formatted = [f"{id}: {name}" for id, name in self.get_all_classes()]
|
| 32 |
+
|
| 33 |
def get_all_classes(self):
|
| 34 |
"""
|
| 35 |
Gets all available COCO classes.
|
| 36 |
+
|
| 37 |
Returns:
|
| 38 |
List[Tuple[int, str]]: List of (class_id, class_name) tuples
|
| 39 |
"""
|
|
|
|
| 52 |
except Exception:
|
| 53 |
pass
|
| 54 |
|
| 55 |
+
# COCO Classes
|
| 56 |
default_classes = {
|
| 57 |
0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
|
| 58 |
6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
|
|
|
|
| 72 |
77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
|
| 73 |
}
|
| 74 |
return sorted(default_classes.items())
|
| 75 |
+
|
| 76 |
def set_image_processor(self, image_processor):
|
| 77 |
"""
|
| 78 |
Set the image processor reference for dynamic class retrieval.
|
| 79 |
+
|
| 80 |
Args:
|
| 81 |
image_processor: The ImageProcessor instance
|
| 82 |
"""
|
| 83 |
self._image_processor = image_processor
|
| 84 |
+
|
| 85 |
def get_css_styles(self):
|
| 86 |
"""
|
| 87 |
Get CSS styles for the interface.
|
| 88 |
+
|
| 89 |
Returns:
|
| 90 |
str: CSS styles
|
| 91 |
"""
|
| 92 |
try:
|
| 93 |
return Style.get_css()
|
| 94 |
except ImportError:
|
| 95 |
+
# fallback defualt CSS style
|
| 96 |
return """
|
| 97 |
.app-header {
|
| 98 |
text-align: center;
|
|
|
|
| 111 |
border: none !important;
|
| 112 |
border-radius: 8px !important;
|
| 113 |
}
|
| 114 |
+
.video-summary-content-wrapper {
|
| 115 |
+
max-height: 400px;
|
| 116 |
+
overflow-y: auto;
|
| 117 |
+
background-color: #f8f9fa;
|
| 118 |
+
border-radius: 8px;
|
| 119 |
+
padding: 15px;
|
| 120 |
+
border: 1px solid #e2e8f0;
|
| 121 |
+
}
|
| 122 |
"""
|
| 123 |
+
|
| 124 |
def get_model_description(self, model_name):
|
| 125 |
"""
|
| 126 |
Get model description for the given model name.
|
| 127 |
+
|
| 128 |
Args:
|
| 129 |
model_name: Name of the model
|
| 130 |
+
|
| 131 |
Returns:
|
| 132 |
str: Model description
|
| 133 |
"""
|
|
|
|
| 135 |
return DetectionModel.get_model_description(model_name)
|
| 136 |
except ImportError:
|
| 137 |
return f"Model: {model_name}"
|
| 138 |
+
|
| 139 |
def create_header(self):
|
| 140 |
"""
|
| 141 |
Create the application header.
|
| 142 |
+
|
| 143 |
Returns:
|
| 144 |
gr.HTML: Header HTML component
|
| 145 |
"""
|
|
|
|
| 150 |
<div style="display: flex; justify-content: center; gap: 10px; margin: 0.5rem 0;"><div style="height: 3px; width: 80px; background: linear-gradient(90deg, #38b2ac, #4299e1);"></div></div>
|
| 151 |
<div style="display: flex; justify-content: center; gap: 25px; margin-top: 1.5rem;">
|
| 152 |
<div style="padding: 8px 15px; border-radius: 20px; background: rgba(66, 153, 225, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🖼️</span> Image Analysis</div>
|
| 153 |
+
<div style="padding: 8px 15px; border-radius: 20px; background: rgba(56, 178, 172, 0.15); color: #2b6cb0; font-weight: 500; font-size: 0.9rem;"><span style="margin-right: 6px;">🎬</span> Video Analysis with Temporal Tracking</div>
|
| 154 |
</div>
|
| 155 |
<div style="margin-top: 20px; padding: 10px 15px; background-color: rgba(255, 248, 230, 0.9); border-left: 3px solid #f6ad55; border-radius: 6px; max-width: 600px; margin-left: auto; margin-right: auto; text-align: left;">
|
| 156 |
<p style="margin: 0; font-size: 0.9rem; color: #805ad5; font-weight: 500;">
|
|
|
|
| 160 |
</div>
|
| 161 |
</div>
|
| 162 |
""")
|
| 163 |
+
|
| 164 |
def create_footer(self):
|
| 165 |
"""
|
| 166 |
Create the application footer.
|
| 167 |
+
|
| 168 |
Returns:
|
| 169 |
gr.HTML: Footer HTML component
|
| 170 |
"""
|
| 171 |
return gr.HTML("""
|
| 172 |
<div class="footer" style="padding: 25px 0; text-align: center; background: linear-gradient(to right, #f5f9fc, #e1f5fe); border-top: 1px solid #e2e8f0; margin-top: 30px;">
|
| 173 |
<div style="margin-bottom: 15px;">
|
| 174 |
+
<p style="font-size: 14px; color: #4A5568; margin: 5px 0;">Powered by YOLOv8, CLIP, Places365, Meta Llama3.2 and Ultralytics • Enhanced Video Processing with Temporal Analysis • Created with Gradio</p>
|
| 175 |
</div>
|
| 176 |
<div style="display: flex; align-items: center; justify-content: center; gap: 20px; margin-top: 15px;">
|
| 177 |
<p style="font-family: 'Arial', sans-serif; font-size: 14px; font-weight: 500; letter-spacing: 2px; background: linear-gradient(90deg, #38b2ac, #4299e1); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0; text-transform: uppercase; display: inline-block;">EXPLORE THE CODE →</p>
|
|
|
|
| 181 |
</div>
|
| 182 |
</div>
|
| 183 |
""")
|
| 184 |
+
|
| 185 |
def create_image_tab(self):
|
| 186 |
"""
|
| 187 |
Create the image processing tab with all components.
|
| 188 |
+
|
| 189 |
Returns:
|
| 190 |
Dict: Dictionary containing all image tab components
|
| 191 |
"""
|
| 192 |
components = {}
|
| 193 |
+
|
| 194 |
with gr.Tab("Image Processing"):
|
| 195 |
components['current_image_model'] = gr.State("yolov8m.pt")
|
| 196 |
+
|
| 197 |
with gr.Row(equal_height=False):
|
| 198 |
# Left Column: Image Input & Controls
|
| 199 |
with gr.Column(scale=4, elem_classes="input-panel"):
|
| 200 |
with gr.Group():
|
| 201 |
gr.HTML('<div class="section-heading">Upload Image</div>')
|
| 202 |
components['image_input'] = gr.Image(
|
| 203 |
+
type="pil",
|
| 204 |
+
label="Upload an image",
|
| 205 |
elem_classes="upload-box"
|
| 206 |
)
|
| 207 |
|
|
|
|
| 212 |
label="Select Model",
|
| 213 |
info="Choose speed vs. accuracy (n=fast, m=balanced, x=accurate)"
|
| 214 |
)
|
| 215 |
+
|
| 216 |
components['image_model_info'] = gr.Markdown(
|
| 217 |
self.get_model_description("yolov8m.pt")
|
| 218 |
)
|
|
|
|
| 242 |
components['vehicles_btn'] = gr.Button("Vehicles", size="sm")
|
| 243 |
components['animals_btn'] = gr.Button("Animals", size="sm")
|
| 244 |
components['objects_btn'] = gr.Button("Common Objects", size="sm")
|
| 245 |
+
|
| 246 |
components['image_class_filter'] = gr.Dropdown(
|
| 247 |
choices=self.class_choices_formatted,
|
| 248 |
multiselect=True,
|
|
|
|
| 251 |
)
|
| 252 |
|
| 253 |
components['image_detect_btn'] = gr.Button(
|
| 254 |
+
"Analyze Image",
|
| 255 |
+
variant="primary",
|
| 256 |
elem_classes="detect-btn"
|
| 257 |
)
|
| 258 |
|
|
|
|
| 297 |
# Detection Result Tab
|
| 298 |
with gr.Tab("Detection Result"):
|
| 299 |
components['image_result_image'] = gr.Image(
|
| 300 |
+
type="pil",
|
| 301 |
label="Detection Result"
|
| 302 |
)
|
| 303 |
gr.HTML('<div class="section-heading">Detection Details</div>')
|
| 304 |
components['image_result_text'] = gr.Textbox(
|
| 305 |
+
label=None,
|
| 306 |
+
lines=10,
|
| 307 |
+
elem_id="detection-details",
|
| 308 |
container=False
|
| 309 |
)
|
| 310 |
|
| 311 |
# Scene Understanding Tab
|
| 312 |
with gr.Tab("Scene Understanding"):
|
| 313 |
gr.HTML('<div class="section-heading">Scene Analysis</div>')
|
| 314 |
+
|
| 315 |
# Info details
|
| 316 |
gr.HTML("""
|
| 317 |
<details class="info-details" style="margin: 5px 0 15px 0;">
|
|
|
|
| 335 |
</p>
|
| 336 |
</div>
|
| 337 |
''')
|
| 338 |
+
|
| 339 |
components['image_scene_description_html'] = gr.HTML(
|
| 340 |
+
label=None,
|
| 341 |
elem_id="scene_analysis_description_text"
|
| 342 |
)
|
| 343 |
|
| 344 |
# Original Scene Analysis accordion
|
| 345 |
with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
|
| 346 |
components['image_llm_description'] = gr.HTML(
|
| 347 |
+
label=None,
|
| 348 |
elem_id="original_scene_description_text"
|
| 349 |
)
|
| 350 |
|
|
|
|
| 352 |
with gr.Column(scale=1):
|
| 353 |
gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Possible Activities</div>')
|
| 354 |
components['image_activities_list'] = gr.Dataframe(
|
| 355 |
+
headers=["Activity"],
|
| 356 |
+
datatype=["str"],
|
| 357 |
+
row_count=5,
|
| 358 |
+
col_count=1,
|
| 359 |
wrap=True
|
| 360 |
)
|
| 361 |
|
| 362 |
with gr.Column(scale=1):
|
| 363 |
gr.HTML('<div class="section-heading" style="font-size:1rem; text-align:left;">Safety Concerns</div>')
|
| 364 |
components['image_safety_list'] = gr.Dataframe(
|
| 365 |
+
headers=["Concern"],
|
| 366 |
+
datatype=["str"],
|
| 367 |
+
row_count=5,
|
| 368 |
+
col_count=1,
|
| 369 |
wrap=True
|
| 370 |
)
|
| 371 |
|
| 372 |
gr.HTML('<div class="section-heading">Functional Zones</div>')
|
| 373 |
components['image_zones_json'] = gr.JSON(
|
| 374 |
+
label=None,
|
| 375 |
elem_classes="json-box"
|
| 376 |
)
|
| 377 |
|
| 378 |
gr.HTML('<div class="section-heading">Lighting Conditions</div>')
|
| 379 |
components['image_lighting_info'] = gr.JSON(
|
| 380 |
+
label=None,
|
| 381 |
elem_classes="json-box"
|
| 382 |
)
|
| 383 |
|
|
|
|
| 387 |
with gr.Column(scale=3, elem_classes="plot-column"):
|
| 388 |
gr.HTML('<div class="section-heading">Object Distribution</div>')
|
| 389 |
components['image_plot_output'] = gr.Plot(
|
| 390 |
+
label=None,
|
| 391 |
elem_classes="large-plot-container"
|
| 392 |
)
|
| 393 |
with gr.Column(scale=2, elem_classes="stats-column"):
|
| 394 |
gr.HTML('<div class="section-heading">Detection Statistics</div>')
|
| 395 |
components['image_stats_json'] = gr.JSON(
|
| 396 |
+
label=None,
|
| 397 |
elem_classes="enhanced-json-display"
|
| 398 |
)
|
| 399 |
+
|
| 400 |
return components
|
| 401 |
|
| 402 |
def create_video_tab(self):
|
| 403 |
"""
|
| 404 |
Create the video processing tab with all components.
|
| 405 |
+
注意:移除了複雜的時序分析控制項,簡化為基本的統計分析
|
| 406 |
+
|
| 407 |
Returns:
|
| 408 |
Dict: Dictionary containing all video tab components
|
| 409 |
"""
|
| 410 |
components = {}
|
| 411 |
+
|
| 412 |
with gr.Tab("Video Processing"):
|
| 413 |
with gr.Row(equal_height=False):
|
| 414 |
# Left Column: Video Input & Controls
|
|
|
|
| 453 |
choices=self.model_choices,
|
| 454 |
value="yolov8n.pt",
|
| 455 |
label="Select Model (Video)",
|
| 456 |
+
info="Faster models (like 'n') are recommended for video processing"
|
| 457 |
)
|
| 458 |
components['video_confidence'] = gr.Slider(
|
| 459 |
minimum=0.1, maximum=0.9, value=0.4, step=0.05,
|
| 460 |
+
label="Confidence Threshold (Video)",
|
| 461 |
+
info="Higher threshold reduces false detections"
|
| 462 |
)
|
| 463 |
components['video_process_interval'] = gr.Slider(
|
| 464 |
minimum=1, maximum=60, value=10, step=1,
|
| 465 |
label="Processing Interval (Frames)",
|
| 466 |
+
info="Analyze every Nth frame (higher value = faster processing)"
|
| 467 |
)
|
| 468 |
+
|
| 469 |
+
# 簡化的分析說明
|
| 470 |
+
gr.HTML("""
|
| 471 |
+
<div style="padding: 8px; margin-top: 10px; background-color: #f0f7ff; border-radius: 4px; border-left: 3px solid #4299e1; font-size: 12px;">
|
| 472 |
+
<p style="margin: 0; color: #4a5568;">
|
| 473 |
+
<b>Analysis Features:</b><br>
|
| 474 |
+
• Accurate object counting with duplicate detection removal<br>
|
| 475 |
+
• Timeline analysis showing when objects first appear<br>
|
| 476 |
+
• Duration tracking for object presence in video<br>
|
| 477 |
+
• Simple, clear statistical summaries
|
| 478 |
+
</p>
|
| 479 |
+
</div>
|
| 480 |
+
""")
|
| 481 |
+
|
| 482 |
components['video_process_btn'] = gr.Button(
|
| 483 |
+
"Analyze Video",
|
| 484 |
+
variant="primary",
|
| 485 |
elem_classes="detect-btn"
|
| 486 |
)
|
| 487 |
|
|
|
|
| 490 |
gr.HTML('<div class="section-heading">How to Use (Video)</div>')
|
| 491 |
gr.Markdown("""
|
| 492 |
1. Choose your input method: Upload a file or enter a URL.
|
| 493 |
+
2. Adjust settings if needed:
|
| 494 |
+
* Use **faster models** (yolov8n) for quicker processing
|
| 495 |
+
* Set **larger intervals** (15+ frames) for longer videos
|
| 496 |
+
* Adjust **confidence threshold** to filter low-quality detections
|
| 497 |
+
3. Click "Analyze Video". **Processing time varies based on video length.**
|
| 498 |
+
4. Review the results: annotated video and statistical analysis.
|
| 499 |
+
|
| 500 |
+
**⚡ Performance Tips:**
|
| 501 |
+
* For videos longer than 2 minutes, use interval ≥ 15 frames
|
| 502 |
+
* YOLOv8n model provides best speed for video processing
|
| 503 |
+
* Higher confidence thresholds reduce processing noise
|
| 504 |
""")
|
| 505 |
|
| 506 |
# Video examples
|
|
|
|
| 508 |
gr.HTML("""
|
| 509 |
<div style="padding: 10px; background-color: #f0f7ff; border-radius: 6px; margin-bottom: 15px;">
|
| 510 |
<p style="font-size: 14px; color: #4A5568; margin: 0;">
|
| 511 |
+
Upload any video containing objects that YOLO can detect. For testing, find sample videos from
|
| 512 |
+
<a href="https://www.pexels.com/search/videos/street/" target="_blank" style="color: #3182ce; text-decoration: underline;">Pexels</a> or
|
| 513 |
+
<a href="https://www.youtube.com/results?search_query=traffic+camera+footage" target="_blank" style="color: #3182ce; text-decoration: underline;">YouTube traffic footage</a>.
|
| 514 |
</p>
|
| 515 |
</div>
|
| 516 |
""")
|
|
|
|
| 518 |
# Right Column: Video Results
|
| 519 |
with gr.Column(scale=6, elem_classes="output-panel video-result-panel"):
|
| 520 |
gr.HTML("""
|
| 521 |
+
<div class="section-heading">Video Analysis Results</div>
|
| 522 |
<details class="info-details" style="margin: 5px 0 15px 0;">
|
| 523 |
<summary style="padding: 8px; background-color: #f0f7ff; border-radius: 6px; border-left: 3px solid #4299e1; font-weight: bold; cursor: pointer; color: #2b6cb0;">
|
| 524 |
+
🎬 Simplified Video Analysis Features
|
| 525 |
</summary>
|
| 526 |
<div style="margin-top: 8px; padding: 10px; background-color: #f8f9fa; border-radius: 6px; border: 1px solid #e2e8f0;">
|
| 527 |
<p style="font-size: 13px; color: #718096; margin: 0;">
|
| 528 |
+
<b>Focus on practical insights:</b> This analysis provides accurate object counts and timing information
|
| 529 |
+
without complex tracking. The system uses spatial clustering to eliminate duplicate detections and
|
| 530 |
+
provides clear timeline data showing when objects first appear and how long they remain visible.
|
| 531 |
+
<br><br>
|
| 532 |
+
<b>Key benefits:</b> Reliable object counting, clear timeline analysis, and easy-to-understand results
|
| 533 |
+
that directly answer questions like "How many cars are in this video?" and "When do they appear?"
|
| 534 |
</p>
|
| 535 |
</div>
|
| 536 |
</details>
|
| 537 |
""")
|
| 538 |
+
|
| 539 |
components['video_output'] = gr.Video(
|
| 540 |
+
label="Analyzed Video with Object Detection",
|
| 541 |
elem_classes="video-output-container"
|
| 542 |
)
|
| 543 |
|
| 544 |
+
+            with gr.Tabs(elem_classes="video-results-tabs"):
+                # Analysis Summary Tab
+                with gr.Tab("Analysis Summary"):
+                    gr.HTML('<div class="section-heading">Video Analysis Report</div>')
+                    gr.HTML("""
+                    <div style="margin-bottom: 10px; padding: 8px; background-color: #f0f9ff; border-radius: 4px; border-left: 3px solid #4299e1; font-size: 12px;">
+                        <p style="margin: 0; color: #4a5568;">
+                            This summary provides object counts, timeline information, and insights about what appears in your video.
+                            Results are based on spatial clustering analysis to ensure accurate counting.
+                        </p>
+                    </div>
+                    """)
+                    components['video_summary_text'] = gr.HTML(
+                        label=None,
+                        elem_id="video-summary-html-output"
+                    )
+
+                # Detailed Statistics Tab
+                with gr.Tab("Detailed Statistics"):
+                    gr.HTML('<div class="section-heading">Complete Analysis Data</div>')
+
+                    with gr.Accordion("Processing Information", open=True):
+                        gr.HTML("""
+                        <div style="padding: 6px; background-color: #f8f9fa; border-radius: 4px; margin-bottom: 10px; font-size: 12px;">
+                            <p style="margin: 0; color: #4a5568;">
+                                Basic information about video processing parameters and performance.
+                            </p>
+                        </div>
+                        """)
+                        components['video_stats_json'] = gr.JSON(
+                            label=None,
+                            elem_classes="video-stats-display"
+                        )
+
+                    with gr.Accordion("Object Details", open=False):
+                        gr.HTML("""
+                        <div style="padding: 6px; background-color: #f8f9fa; border-radius: 4px; margin-bottom: 10px; font-size: 12px;">
+                            <p style="margin: 0; color: #4a5568;">
+                                Detailed breakdown of each object type detected, including timing and confidence information.
+                            </p>
+                        </div>
+                        """)
+                        components['video_object_details'] = gr.JSON(
+                            label="Object-by-Object Analysis",
+                            elem_classes="object-details-display"
+                        )
 
         return components
+
     def get_filter_button_mappings(self):
         """
         Get the class ID mappings for filter buttons.
+
         Returns:
             Dict: Dictionary containing class ID lists for different categories
         """
         available_classes_list = self.get_all_classes()
+
         return {
             'people_classes_ids': [0],
             'vehicles_classes_ids': [1, 2, 3, 4, 5, 6, 7, 8],
             'common_objects_ids': [39, 41, 42, 43, 44, 45, 56, 57, 60, 62, 63, 67, 73],
             'available_classes_list': available_classes_list
         }
+
+    def create_interface(self,
+                         handle_image_upload_fn,
+                         handle_video_upload_fn,
                          download_video_from_url_fn):
         """
         Create the complete Gradio interface.
+
         Args:
             handle_image_upload_fn: Function to handle image upload
             handle_video_upload_fn: Function to handle video upload
             download_video_from_url_fn: Function to download video from URL
+
         Returns:
             gr.Blocks: Complete Gradio interface
         """
         css = self.get_css_styles()
+
         with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
+
             # Header
             with gr.Group(elem_classes="app-header"):
                 self.create_header()
 
             # Main Content with Tabs
             with gr.Tabs(elem_classes="tabs"):
+
                 # Image Processing Tab
                 image_components = self.create_image_tab()
+
                 # Video Processing Tab
                 video_components = self.create_video_tab()
 
             # Setup Event Listeners
             self._setup_event_listeners(
+                image_components,
+                video_components,
+                handle_image_upload_fn,
                 handle_video_upload_fn
             )
 
         return demo
+
+    def _setup_event_listeners(self,
+                               image_components,
+                               video_components,
+                               handle_image_upload_fn,
                                handle_video_upload_fn):
         """
         Setup all event listeners for the interface.
+
         Args:
             image_components: Dictionary of image tab components
             video_components: Dictionary of video tab components
 
         common_objects_ids = filter_mappings['common_objects_ids']
 
         image_components['people_btn'].click(
+            lambda: [f"{id}: {name}" for id, name in available_classes_list if id in people_classes_ids],
             outputs=image_components['image_class_filter']
         )
         image_components['vehicles_btn'].click(
+            lambda: [f"{id}: {name}" for id, name in available_classes_list if id in vehicles_classes_ids],
             outputs=image_components['image_class_filter']
         )
         image_components['animals_btn'].click(
+            lambda: [f"{id}: {name}" for id, name in available_classes_list if id in animals_classes_ids],
             outputs=image_components['image_class_filter']
         )
         image_components['objects_btn'].click(
+            lambda: [f"{id}: {name}" for id, name in available_classes_list if id in common_objects_ids],
             outputs=image_components['image_class_filter']
         )
 
         # Video Input Type Change Handler
         video_components['video_input_type'].change(
+            fn=lambda input_type: [
+                # Show/hide file upload
+                gr.update(visible=(input_type == "upload")),
+                # Show/hide URL input
+                gr.update(visible=(input_type == "url"))
+            ],
+            inputs=[video_components['video_input_type']],
+            outputs=[video_components['video_input'], video_components['video_url_input']]
         )
 
         # Image Detect Button Click Handler
         image_components['image_detect_btn'].click(
             fn=handle_image_upload_fn,
             inputs=[
+                image_components['image_input'],
+                image_components['image_model_dropdown'],
+                image_components['image_confidence'],
+                image_components['image_class_filter'],
+                image_components['use_llm'],
                 image_components['use_landmark_detection']
             ],
             outputs=[
+                image_components['image_result_image'],
+                image_components['image_result_text'],
+                image_components['image_stats_json'],
                 image_components['image_plot_output'],
+                image_components['image_scene_description_html'],
+                image_components['image_llm_description'],
+                image_components['image_activities_list'],
+                image_components['image_safety_list'],
                 image_components['image_zones_json'],
                 image_components['image_lighting_info']
             ]
         )
 
+        # Video Process Button Click Handler
         video_components['video_process_btn'].click(
+            fn=handle_video_upload_fn,
+            inputs=[
+                video_components['video_input'],
+                video_components['video_url_input'],
+                video_components['video_input_type'],
+                video_components['video_model_dropdown'],
+                video_components['video_confidence'],
+                video_components['video_process_interval']
             ],
+            outputs=[
+                video_components['video_output'],
+                video_components['video_summary_text'],
+                video_components['video_stats_json']
             ]
         )
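The listeners above only connect Gradio components to callbacks; the callbacks themselves are injected by whatever script builds the app. A minimal, hypothetical sketch of that wiring (handler bodies elided, function and module names assumed, not part of this commit):

from ui_manager import UIManager

def handle_image_upload(image, model_name, confidence, class_filter, use_llm, use_landmark):
    ...  # hypothetical: run detection + scene analysis, return the image-tab outputs in order

def handle_video_upload(video_file, video_url, input_type, model_name, confidence, interval):
    ...  # hypothetical: call VideoProcessor.process_video, return (video, summary_html, stats)

def download_video_from_url(url):
    ...  # hypothetical: fetch the URL to a temp file and return its local path

ui = UIManager()
demo = ui.create_interface(
    handle_image_upload_fn=handle_image_upload,
    handle_video_upload_fn=handle_video_upload,
    download_video_from_url_fn=download_video_from_url,
)

if __name__ == "__main__":
    demo.launch()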
video_processor.py
CHANGED
@@ -2,345 +2,550 @@ import cv2
 import os
 import tempfile
 import uuid
-
 import numpy as np
 from typing import Dict, List, Tuple, Any, Optional
-import time
 from collections import defaultdict
 
-from image_processor import ImageProcessor
-from evaluation_metrics import EvaluationMetrics
-from scene_analyzer import SceneAnalyzer
 from detection_model import DetectionModel
 
 class VideoProcessor:
     """
     """

        """
        Args:
-            video_path
-            model_name
-            confidence_threshold
-            process_interval
-
        Returns:
-            Tuple[Optional[str], str,
        """
        if not video_path or not os.path.exists(video_path):
            print(f"Error: Video file not found at {video_path}")
-            return None, "
-
-        print(f"Starting video
        start_time = time.time()
-
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
-        fps = cap.get(cv2.CAP_PROP_FPS)
-        if fps <= 0:  # Handle case where fps is not available or invalid
-            fps = 30  # Assume a default fps
-            print(f"Warning: Could not get valid FPS for video. Assuming {fps} FPS.")
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        next_object_id = 0  # next available object ID
-        tracking_threshold = 0.6  # IoU threshold for treating detections as the same object
-        object_colors = {}  # fixed color assigned to each tracked object
-
-        # Setup Output Video
-        output_filename = f"processed_{uuid.uuid4().hex}_{os.path.basename(video_path)}"
-        temp_dir = tempfile.gettempdir()  # Use system's temp directory
        output_path = os.path.join(temp_dir, output_filename)
-        # Ensure the output path has a compatible extension (like .mp4)
        if not output_path.lower().endswith(('.mp4', '.avi', '.mov')):
            output_path += ".mp4"
-
-        # Use 'mp4v' for MP4, common and well-supported
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
-            print(f"Error: Could not open VideoWriter for path: {output_path}")
            cap.release()
-            return None,
        print(f"Output video will be saved to: {output_path}")
-
        frame_count = 0
        processed_frame_count = 0
-
-        summary_lines = []
-        last_description = "Analyzing scene..."  # Initial description
-        frame_since_last_desc = description_update_interval_frames  # Trigger analysis on first processed frame
-
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
-                    break
-
                frame_count += 1
-
-                # Process frame based on interval
                if frame_count % process_interval == 0:
                    processed_frame_count += 1
-
-                    # 1. Convert frame format BGR -> RGB -> PIL
                    try:
                        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        pil_image = Image.fromarray(frame_rgb)
                    except Exception as e:
-                        print(f"Error
-                        continue
-
-                    if not model_instance or not model_instance.is_model_loaded:
-                        print(f"Error: Model {model_name} not loaded. Skipping frame {frame_count}.")
-                        # Draw basic frame without annotation
-                        cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)
-                        cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)
-                        out.write(frame)
-                        continue
-
-                    # 3. Perform detection
-                    detection_result = model_instance.detect(pil_image)  # Use PIL image
-
-                    current_description_for_frame = last_description  # Default to last known description
-                    scene_analysis_result = None
-                    stats = {}
-
-                    if detection_result and hasattr(detection_result, 'boxes') and len(detection_result.boxes) > 0:
-                        # Ensure SceneAnalyzer is ready within ImageProcessor
-                        if not hasattr(self.image_processor, 'scene_analyzer') or self.image_processor.scene_analyzer is None:
-                            print("Initializing SceneAnalyzer...")
-                            # Pass class names from the current detection result
-                            self.image_processor.scene_analyzer = SceneAnalyzer(class_names=detection_result.names)
-                        elif self.image_processor.scene_analyzer.class_names is None:
-                            # Update class names if they were missing
-                            self.image_processor.scene_analyzer.class_names = detection_result.names
-                            if hasattr(self.image_processor.scene_analyzer, 'spatial_analyzer'):
-                                self.image_processor.scene_analyzer.spatial_analyzer.class_names = detection_result.names
-
-                        # 4. Perform Scene Analysis (periodically)
-                        if frame_since_last_desc >= description_update_interval_frames:
-                            print(f"Analyzing scene at frame {frame_count} (threshold: {description_update_interval_frames} frames)...")
-                            # Pass lighting_info=None for now, as it's disabled for performance
-                            scene_analysis_result = self.image_processor.analyze_scene(detection_result, lighting_info=None)
-                            current_description_for_frame = scene_analysis_result.get("description", last_description)
-                            last_description = current_description_for_frame  # Cache the new description
-                            frame_since_last_desc = 0  # Reset counter
-
-                        # 5. Calculate Statistics for this frame
-                        stats = EvaluationMetrics.calculate_basic_stats(detection_result)
-                        stats['frame_number'] = frame_count  # Add frame number to stats
-                        all_stats.append(stats)
-
-                        # 6. Draw annotations
-                        names = detection_result.names
-                        boxes = detection_result.boxes.xyxy.cpu().numpy()
-                        classes = detection_result.boxes.cls.cpu().numpy().astype(int)
-                        confs = detection_result.boxes.conf.cpu().numpy()
-
-                        def calculate_iou(box1, box2):
-                            """Calculate Intersection IOU value"""
-                            x1_1, y1_1, x2_1, y2_1 = box1
-                            x1_2, y1_2, x2_2, y2_2 = box2
-
-                            xi1 = max(x1_1, x1_2)
-                            yi1 = max(y1_1, y1_2)
-                            xi2 = min(x2_1, x2_2)
-                            yi2 = min(y2_1, y2_2)
-
-                            inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
-                            box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
-                            box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
-
-                            union_area = box1_area + box2_area - inter_area
-
-                            return inter_area / union_area if union_area > 0 else 0
-
-                        # Process all detections in the current frame
-                        current_detected_objects = {}
-
-                        for box, cls_id, conf in zip(boxes, classes, confs):
-                            x1, y1, x2, y2 = map(int, box)
-
-                            # Find the best-matching tracked object
-                            best_match_id = None
-                            best_match_iou = 0
-
-                            for obj_id, (old_box, old_cls_id, _) in last_detected_objects.items():
-                                if old_cls_id == cls_id:  # only compare within the same class
-                                    iou = calculate_iou(box, old_box)
-                                    if iou > tracking_threshold and iou > best_match_iou:
-                                        best_match_id = obj_id
-                                        best_match_iou = iou
-
-                            # Reuse the existing ID if matched, otherwise assign a new one
-                            if best_match_id is not None:
-                                obj_id = best_match_id
-                            else:
-                                obj_id = next_object_id
-                                next_object_id += 1
-
-                            # Use high-visibility colors
-                            bright_colors = [
-                                (0, 0, 255),    # red
-                                (0, 255, 0),    # green
-                                (255, 0, 0),    # blue
-                                (0, 255, 255),  # yellow
-                                (255, 0, 255),  # purple
-                                (255, 128, 0),  # orange
-                                (128, 0, 255)   # purple
-                            ]
-                            object_colors[obj_id] = bright_colors[obj_id % len(bright_colors)]
-
-                            # update tracking info
-                            current_detected_objects[obj_id] = (box, cls_id, conf)
-
-                            color = object_colors.get(obj_id, (0, 255, 0))  # default is green
-                            label = f"{names.get(cls_id, 'Unknown')}-{obj_id}: {conf:.2f}"
-
-                            # Smooth the bounding box: average with the previous frame's position for known objects
-                            if obj_id in last_detected_objects:
-                                old_box, _, _ = last_detected_objects[obj_id]
-                                old_x1, old_y1, old_x2, old_y2 = map(int, old_box)
-                                # smoothing coefficients
-                                alpha = 0.7  # current weight
-                                beta = 0.3   # history weight
-
-                                x1 = int(alpha * x1 + beta * old_x1)
-                                y1 = int(alpha * y1 + beta * old_y1)
-                                x2 = int(alpha * x2 + beta * old_x2)
-                                y2 = int(alpha * y2 + beta * old_y2)
-
-                            # draw box and label
-                            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
-                            # add text
-                            (w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
-                            cv2.rectangle(frame, (x1, y1 - h - 10), (x1 + w, y1 - 10), color, -1)
-                            cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)
-
-                        # update tracking info
-                        last_detected_objects = current_detected_objects.copy()
-
-                    # Draw the current scene description on the frame
-                    cv2.putText(frame, f"Scene: {current_description_for_frame[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)  # Black outline
-                    cv2.putText(frame, f"Scene: {current_description_for_frame[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)  # White text
-
-                # Write the frame (annotated or original) to the output video
-                # Draw last known description if this frame wasn't processed
-                if not current_frame_annotated:
-                    cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3, cv2.LINE_AA)
-                    cv2.putText(frame, f"Scene: {last_description[:80]}...", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)
-
-                out.write(frame)  # Write frame to output file
-
        except Exception as e:
-            print(f"Error during video processing
-            import traceback
            traceback.print_exc()
-            summary_lines.append(f"An error occurred during processing: {e}")
        finally:
-            # Release resources
            cap.release()
            out.release()

-                    object_cumulative_counts[obj_name] = 0
-                object_cumulative_counts[obj_name] += count_in_frame
-
-                # Max concurrent count
-                if obj_name not in object_max_concurrent_counts:
-                    object_max_concurrent_counts[obj_name] = 0
-                # Update the max count if the current frame's count is higher
-                object_max_concurrent_counts[obj_name] = max(object_max_concurrent_counts[obj_name], count_in_frame)
-
-        # Add sorted results to the final dictionary
-        aggregated_stats["cumulative_detections"] = dict(sorted(object_cumulative_counts.items(), key=lambda item: item[1], reverse=True))
-        aggregated_stats["max_concurrent_detections"] = dict(sorted(object_max_concurrent_counts.items(), key=lambda item: item[1], reverse=True))
-
-        # Calculate average objects per processed frame
-        if processed_frame_count > 0:
-            aggregated_stats["avg_objects_per_processed_frame"] = round(total_detected_in_processed / processed_frame_count, 2)
-
-        summary_text = "\n".join(summary_lines)
-        print("Generated Summary:\n", summary_text)
-        print("Aggregated Stats (Revised):\n", aggregated_stats)  # Print the revised stats
-
-        # Return the potentially updated output_path
-        return output_path, summary_text, aggregated_stats
 import os
 import tempfile
 import uuid
+import time
+import traceback
 import numpy as np
+from PIL import Image
 from typing import Dict, List, Tuple, Any, Optional
 from collections import defaultdict
+from dataclasses import dataclass
+import math
 
 from detection_model import DetectionModel
+from evaluation_metrics import EvaluationMetrics
+
+@dataclass
+class ObjectRecord:
+    """Record of one detected object class over the course of the video."""
+    class_name: str
+    first_seen_time: float
+    last_seen_time: float
+    total_detections: int
+    peak_count_in_frame: int
+    confidence_avg: float
+
+    def get_duration(self) -> float:
+        """Return how long the object was present in the video."""
+        return self.last_seen_time - self.first_seen_time
+
+    def format_time(self, seconds: float) -> str:
+        """Format a time value for display."""
+        minutes = int(seconds // 60)
+        secs = int(seconds % 60)
+        if minutes > 0:
+            return f"{minutes}m{secs:02d}s"
+        return f"{secs}s"
 
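For reference, a small illustrative snippet (invented values) showing how an ObjectRecord reports duration and formatted timestamps:

# Illustrative only -- the numbers are made up.
rec = ObjectRecord(
    class_name="person",
    first_seen_time=3.2,
    last_seen_time=128.7,
    total_detections=41,
    peak_count_in_frame=3,
    confidence_avg=0.82,
)
print(rec.get_duration())                     # ~125.5 seconds on screen
print(rec.format_time(rec.first_seen_time))   # "3s"
print(rec.format_time(rec.last_seen_time))    # "2m08s"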
 class VideoProcessor:
     """
+    Video processor focused on practical statistical analysis:
+    - accurate object counting and identification
+    - object appearance-time analysis
+    - detection quality assessment
+    - activity density statistics
     """
+
+    def __init__(self):
+        """Initialize the video processor."""
+        self.detection_models: Dict[str, DetectionModel] = {}
+
+        # Analysis parameters
+        self.spatial_cluster_threshold = 100  # pixel distance threshold for merging duplicate detections
+        self.confidence_filter_threshold = 0.1  # minimum confidence filter
+
+        # Statistics collection
+        self.frame_detections = []  # per-frame detection results
+        self.object_timeline = defaultdict(list)  # per-class object timeline records
+        self.frame_timestamps = []  # frame timestamps
+
+    def get_or_create_model(self, model_name: str, confidence_threshold: float) -> DetectionModel:
+        """Get or create a detection model instance."""
+        model_key = f"{model_name}_{confidence_threshold}"
+
+        if model_key not in self.detection_models:
+            try:
+                model = DetectionModel(model_name, confidence_threshold)
+                self.detection_models[model_key] = model
+                print(f"Loaded detection model: {model_name} with confidence {confidence_threshold}")
+            except Exception as e:
+                print(f"Error loading model {model_name}: {e}")
+                raise
+
+        return self.detection_models[model_key]
+
+    def cluster_detections_by_position(self, detections: List[Dict], threshold: float = 100) -> List[Dict]:
+        """Cluster detections by position and merge nearby duplicate detections."""
+        if not detections:
+            return []
+
+        # Group detections by class before clustering
+        class_groups = defaultdict(list)
+        for det in detections:
+            class_groups[det['class_name']].append(det)
+
+        clustered_results = []
+
+        for class_name, class_detections in class_groups.items():
+            if len(class_detections) == 1:
+                clustered_results.extend(class_detections)
+                continue
+
+            # Run the spatial clustering algorithm
+            clusters = []
+            used = set()
+
+            for i, det1 in enumerate(class_detections):
+                if i in used:
+                    continue
+
+                cluster = [det1]
+                used.add(i)
+
+                # Compute the center of the detection box
+                x1_center = (det1['bbox'][0] + det1['bbox'][2]) / 2
+                y1_center = (det1['bbox'][1] + det1['bbox'][3]) / 2
+
+                # Look for nearby detections
+                for j, det2 in enumerate(class_detections):
+                    if j in used:
+                        continue
+
+                    x2_center = (det2['bbox'][0] + det2['bbox'][2]) / 2
+                    y2_center = (det2['bbox'][1] + det2['bbox'][3]) / 2
+
+                    distance = math.sqrt((x1_center - x2_center)**2 + (y1_center - y2_center)**2)
+
+                    if distance < threshold:
+                        cluster.append(det2)
+                        used.add(j)
+
+                clusters.append(cluster)
+
+            # Produce one representative detection per cluster
+            for cluster in clusters:
+                best_detection = max(cluster, key=lambda x: x['confidence'])
+                avg_confidence = sum(det['confidence'] for det in cluster) / len(cluster)
+                best_detection['confidence'] = avg_confidence
+                best_detection['cluster_size'] = len(cluster)
+                clustered_results.append(best_detection)
+
+        return clustered_results
+
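To make the clustering behaviour concrete, here is a small standalone run on invented detections: two overlapping 'person' boxes whose centers are about 14 px apart merge into one record, while a distant third box stays separate.

# Illustrative only -- boxes and confidences are invented.
from video_processor import VideoProcessor

vp = VideoProcessor()
detections = [
    {'bbox': (100, 100, 200, 300), 'class_id': 0, 'class_name': 'person', 'confidence': 0.90, 'timestamp': 1.0},
    {'bbox': (110, 110, 210, 310), 'class_id': 0, 'class_name': 'person', 'confidence': 0.60, 'timestamp': 1.0},
    {'bbox': (500, 100, 600, 300), 'class_id': 0, 'class_name': 'person', 'confidence': 0.85, 'timestamp': 1.0},
]
merged = vp.cluster_detections_by_position(detections, threshold=100)
print(len(merged))                                   # 2
print(sorted(d['confidence'] for d in merged))       # [0.75, 0.85] -- the merged box keeps the averaged confidence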
+    def analyze_frame_detections(self, detections: Any, timestamp: float, class_names: Dict[int, str]):
+        """Analyze a single frame's detections and update the statistics."""
+        if not hasattr(detections, 'boxes') or len(detections.boxes) == 0:
+            self.frame_detections.append([])
+            self.frame_timestamps.append(timestamp)
+            return
+
+        # Extract detected data
+        boxes = detections.boxes.xyxy.cpu().numpy()
+        classes = detections.boxes.cls.cpu().numpy().astype(int)
+        confidences = detections.boxes.conf.cpu().numpy()
+
+        # Convert to a unified detection format
+        frame_detections = []
+        for box, cls_id, conf in zip(boxes, classes, confidences):
+            if conf >= self.confidence_filter_threshold:
+                frame_detections.append({
+                    'bbox': tuple(box),
+                    'class_id': cls_id,
+                    'class_name': class_names.get(cls_id, f'class_{cls_id}'),
+                    'confidence': conf,
+                    'timestamp': timestamp
+                })
+
+        # Use spatial clustering to avoid duplicate detections
+        clustered_detections = self.cluster_detections_by_position(
+            frame_detections, self.spatial_cluster_threshold
+        )
+
+        # Record results
+        self.frame_detections.append(clustered_detections)
+        self.frame_timestamps.append(timestamp)
+
+        # Update the object timeline records
+        for detection in clustered_detections:
+            class_name = detection['class_name']
+            self.object_timeline[class_name].append({
+                'timestamp': timestamp,
+                'confidence': detection['confidence'],
+                'bbox': detection['bbox']
+            })
+
+    def generate_object_statistics(self, fps: float) -> Dict[str, ObjectRecord]:
+        """Generate per-class object statistics."""
+        object_stats = {}
+
+        for class_name, timeline in self.object_timeline.items():
+            if not timeline:
+                continue
+
+            # Basic time statistics
+            timestamps = [entry['timestamp'] for entry in timeline]
+            confidences = [entry['confidence'] for entry in timeline]
+
+            first_seen = min(timestamps)
+            last_seen = max(timestamps)
+            total_detections = len(timeline)
+            avg_confidence = sum(confidences) / len(confidences)
+
+            # Count objects per timestamp to determine the peak count
+            frame_counts = defaultdict(int)
+            for entry in timeline:
+                frame_timestamp = entry['timestamp']
+                frame_counts[frame_timestamp] += 1
+
+            peak_count = max(frame_counts.values()) if frame_counts else 1
+
+            # Create the object record
+            object_stats[class_name] = ObjectRecord(
+                class_name=class_name,
+                first_seen_time=first_seen,
+                last_seen_time=last_seen,
+                total_detections=total_detections,
+                peak_count_in_frame=peak_count,
+                confidence_avg=avg_confidence
+            )
+
+        return object_stats
+
+    def analyze_object_density(self, object_stats: Dict[str, ObjectRecord], video_duration: float) -> Dict[str, Any]:
+        """Analyze object density and activity patterns."""
+        total_objects = sum(record.peak_count_in_frame for record in object_stats.values())
+        objects_per_minute = (total_objects / video_duration) * 60 if video_duration > 0 else 0
+
+        # Activity distribution per 30-second segment
+        time_segments = defaultdict(int)
+        segment_duration = 30
+
+        for detections, timestamp in zip(self.frame_detections, self.frame_timestamps):
+            segment = int(timestamp // segment_duration) * segment_duration
+            time_segments[segment] += len(detections)
+
+        # Identify peak activity periods
+        peak_segments = []
+        if time_segments:
+            max_activity = max(time_segments.values())
+            threshold = max_activity * 0.8  # segments at >= 80% of the peak count as highly active
+
+            for segment, activity in time_segments.items():
+                if activity >= threshold:
+                    peak_segments.append({
+                        'start_time': segment,
+                        'end_time': min(segment + segment_duration, video_duration),
+                        'activity_count': activity,
+                        'description': f"{segment}s-{min(segment + segment_duration, video_duration):.0f}s"
+                    })
+
+        return {
+            'total_objects_detected': total_objects,
+            'objects_per_minute': round(objects_per_minute, 2),
+            'video_duration_seconds': video_duration,
+            'peak_activity_periods': peak_segments,
+            'activity_distribution': {str(k): v for k, v in time_segments.items()}
+        }
+
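As a worked illustration of the 30-second bucketing used above (timestamps and counts invented): a detection at t = 47.3 s lands in the segment starting at int(47.3 // 30) * 30 = 30, so it is counted in the 30-60 s bucket.

# Illustrative only -- invented (timestamp, detections-in-frame) pairs.
from collections import defaultdict

frame_counts = [(12.0, 3), (47.3, 5), (52.1, 7), (95.0, 2)]
segments = defaultdict(int)
for ts, n in frame_counts:
    segments[int(ts // 30) * 30] += n

print(dict(segments))   # {0: 3, 30: 12, 90: 2} -- the 30-60 s bucket is the busiest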
+
def analyze_quality_metrics(self, object_stats: Dict[str, ObjectRecord]) -> Dict[str, Any]:
|
| 250 |
+
"""分析檢測品質指標"""
|
| 251 |
+
all_confidences = []
|
| 252 |
+
class_confidence_stats = {}
|
| 253 |
+
|
| 254 |
+
# 收集所有置信度數據進行分析
|
| 255 |
+
for class_name, record in object_stats.items():
|
| 256 |
+
class_confidences = []
|
| 257 |
+
for detection_data in self.object_timeline[class_name]:
|
| 258 |
+
conf = detection_data['confidence']
|
| 259 |
+
all_confidences.append(conf)
|
| 260 |
+
class_confidences.append(conf)
|
| 261 |
+
|
| 262 |
+
# 計算各類別的置信度統計
|
| 263 |
+
if class_confidences:
|
| 264 |
+
class_confidence_stats[class_name] = {
|
| 265 |
+
'average_confidence': round(np.mean(class_confidences), 3),
|
| 266 |
+
'min_confidence': round(np.min(class_confidences), 3),
|
| 267 |
+
'max_confidence': round(np.max(class_confidences), 3),
|
| 268 |
+
'confidence_stability': round(1 - np.std(class_confidences), 3),
|
| 269 |
+
'detection_count': len(class_confidences)
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
# 計算整體品質指標
|
| 273 |
+
if all_confidences:
|
| 274 |
+
overall_confidence = np.mean(all_confidences)
|
| 275 |
+
confidence_std = np.std(all_confidences)
|
| 276 |
+
|
| 277 |
+
# 品質等級評估
|
| 278 |
+
if overall_confidence > 0.8 and confidence_std < 0.1:
|
| 279 |
+
quality_grade = "excellent"
|
| 280 |
+
elif overall_confidence > 0.6 and confidence_std < 0.2:
|
| 281 |
+
quality_grade = "good"
|
| 282 |
+
elif overall_confidence > 0.4:
|
| 283 |
+
quality_grade = "fair"
|
| 284 |
+
else:
|
| 285 |
+
quality_grade = "poor"
|
| 286 |
+
|
| 287 |
+
quality_analysis = f"Detection quality: {quality_grade} (avg confidence: {overall_confidence:.3f})"
|
| 288 |
+
else:
|
| 289 |
+
overall_confidence = 0
|
| 290 |
+
confidence_std = 0
|
| 291 |
+
quality_grade = "no_data"
|
| 292 |
+
quality_analysis = "No detection data available for quality analysis"
|
| 293 |
+
|
| 294 |
+
return {
|
| 295 |
+
'overall_confidence': round(overall_confidence, 3),
|
| 296 |
+
'confidence_stability': round(1 - confidence_std, 3),
|
| 297 |
+
'quality_grade': quality_grade,
|
| 298 |
+
'class_confidence_breakdown': class_confidence_stats,
|
| 299 |
+
'total_detections_analyzed': len(all_confidences),
|
| 300 |
+
'quality_analysis': quality_analysis
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
def generate_timeline_analysis(self, object_stats: Dict[str, ObjectRecord], video_duration: float) -> Dict[str, Any]:
|
| 304 |
+
"""生成時間線分析報告"""
|
| 305 |
+
timeline_analysis = {
|
| 306 |
+
'video_duration_seconds': video_duration,
|
| 307 |
+
'object_appearances': {},
|
| 308 |
+
'timeline_summary': []
|
| 309 |
+
}
|
| 310 |
+
|
| 311 |
+
# 分析每個物體的出現的時序
|
| 312 |
+
for class_name, record in object_stats.items():
|
| 313 |
+
timeline_analysis['object_appearances'][class_name] = {
|
| 314 |
+
'first_appearance': record.format_time(record.first_seen_time),
|
| 315 |
+
'first_appearance_seconds': round(record.first_seen_time, 1),
|
| 316 |
+
'last_seen': record.format_time(record.last_seen_time),
|
| 317 |
+
'last_seen_seconds': round(record.last_seen_time, 1),
|
| 318 |
+
'duration_in_video': record.format_time(record.get_duration()),
|
| 319 |
+
'duration_seconds': round(record.get_duration(), 1),
|
| 320 |
+
'estimated_count': record.peak_count_in_frame,
|
| 321 |
+
'detection_confidence': round(record.confidence_avg, 3)
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
# timeline summary
|
| 325 |
+
if object_stats:
|
| 326 |
+
sorted_objects = sorted(object_stats.values(), key=lambda x: x.first_seen_time)
|
| 327 |
+
|
| 328 |
+
for i, record in enumerate(sorted_objects):
|
| 329 |
+
if record.first_seen_time < 2.0:
|
| 330 |
+
summary = f"{record.peak_count_in_frame} {record.class_name}(s) present from the beginning"
|
| 331 |
+
else:
|
| 332 |
+
summary = f"{record.peak_count_in_frame} {record.class_name}(s) first appeared at {record.format_time(record.first_seen_time)}"
|
| 333 |
+
|
| 334 |
+
timeline_analysis['timeline_summary'].append(summary)
|
| 335 |
+
|
| 336 |
+
return timeline_analysis
|
| 337 |
+
|
| 338 |
+
def draw_simple_annotations(self, frame: np.ndarray, detections: List[Dict]) -> np.ndarray:
|
| 339 |
+
"""��視頻幀上繪製檢測標註"""
|
| 340 |
+
annotated_frame = frame.copy()
|
| 341 |
+
|
| 342 |
+
# 不同物體類別分配顏色
|
| 343 |
+
colors = {
|
| 344 |
+
'person': (0, 255, 0), # green
|
| 345 |
+
'car': (255, 0, 0), # blue
|
| 346 |
+
'truck': (0, 0, 255), # red
|
| 347 |
+
'bus': (255, 255, 0), # 青色
|
| 348 |
+
'bicycle': (255, 0, 255), # purple
|
| 349 |
+
'motorcycle': (0, 255, 255) # yellow
|
| 350 |
+
}
|
| 351 |
+
|
| 352 |
+
# 繪製每個檢測結果
|
| 353 |
+
for detection in detections:
|
| 354 |
+
x1, y1, x2, y2 = map(int, detection['bbox'])
|
| 355 |
+
class_name = detection['class_name']
|
| 356 |
+
confidence = detection['confidence']
|
| 357 |
+
|
| 358 |
+
color = colors.get(class_name, (128, 128, 128)) # set gray to default color
|
| 359 |
+
|
| 360 |
+
# 繪製邊界框
|
| 361 |
+
cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), color, 2)
|
| 362 |
+
|
| 363 |
+
# 準備標籤文字
|
| 364 |
+
label = f"{class_name}: {confidence:.2f}"
|
| 365 |
+
if 'cluster_size' in detection and detection['cluster_size'] > 1:
|
| 366 |
+
label += f" (merged: {detection['cluster_size']})"
|
| 367 |
+
|
| 368 |
+
# 繪製標籤背景和文字
|
| 369 |
+
(w, h), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
|
| 370 |
+
cv2.rectangle(annotated_frame, (x1, y1 - h - 10), (x1 + w, y1), color, -1)
|
| 371 |
+
cv2.putText(annotated_frame, label, (x1, y1 - 5),
|
| 372 |
+
cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
|
| 373 |
+
|
| 374 |
+
return annotated_frame
|
| 375 |
+
|
| 376 |
+
def _ensure_string_keys(self, data):
|
| 377 |
+
"""確保所有字典鍵值都轉換為字串格式以支援JSON序列化"""
|
| 378 |
+
if isinstance(data, dict):
|
| 379 |
+
return {str(key): self._ensure_string_keys(value) for key, value in data.items()}
|
| 380 |
+
elif isinstance(data, list):
|
| 381 |
+
return [self._ensure_string_keys(item) for item in data]
|
| 382 |
+
else:
|
| 383 |
+
return data
|
| 384 |
+
|
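A quick illustration (invented data) of what the key-normalization helper does before results are handed to gr.JSON: nested integer keys become strings, while values and list structure are untouched.

# Illustrative only -- invented nested data.
vp = VideoProcessor()
raw = {'activity_distribution': {0: 3, 30: 12}, 'objects': [{'id': 7}]}
print(vp._ensure_string_keys(raw))
# {'activity_distribution': {'0': 3, '30': 12}, 'objects': [{'id': 7}]}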
+    def process_video(self,
+                      video_path: str,
+                      model_name: str,
+                      confidence_threshold: float,
+                      process_interval: int = 10) -> Tuple[Optional[str], Dict[str, Any]]:
         """
+        Process a video file, running object detection and statistical analysis.
+
         Args:
+            video_path: Path to the video file
+            model_name: Name of the YOLO model
+            confidence_threshold: Confidence threshold
+            process_interval: Process every N frames
+
         Returns:
+            Tuple[Optional[str], Dict[str, Any]]: (output video path, analysis results)
         """
         if not video_path or not os.path.exists(video_path):
             print(f"Error: Video file not found at {video_path}")
+            return None, {"error": "Video file not found"}
+
+        print(f"Starting focused video analysis: {video_path}")
         start_time = time.time()
+
+        # Reset processing state
+        self.frame_detections.clear()
+        self.object_timeline.clear()
+        self.frame_timestamps.clear()
+
+        # Open the video file
         cap = cv2.VideoCapture(video_path)
         if not cap.isOpened():
+            return None, {"error": "Could not open video file"}
+
+        # Read basic video properties
+        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        video_duration = total_frames / fps
+
+        print(f"Video properties: {width}x{height} @ {fps:.2f} FPS")
+        print(f"Duration: {video_duration:.1f}s, Total frames: {total_frames}")
+        print(f"Processing every {process_interval} frames")
+
+        # Set up the output video file
+        output_filename = f"analyzed_{uuid.uuid4().hex}_{os.path.basename(video_path)}"
+        temp_dir = tempfile.gettempdir()
         output_path = os.path.join(temp_dir, output_filename)
         if not output_path.lower().endswith(('.mp4', '.avi', '.mov')):
             output_path += ".mp4"
+
         fourcc = cv2.VideoWriter_fourcc(*'mp4v')
         out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+
         if not out.isOpened():
             cap.release()
+            return None, {"error": "Could not create output video file"}
+
         print(f"Output video will be saved to: {output_path}")
+
+        # Load the detection model
+        try:
+            detection_model = self.get_or_create_model(model_name, confidence_threshold)
+        except Exception as e:
+            cap.release()
+            out.release()
+            return None, {"error": f"Failed to load detection model: {str(e)}"}
+
+        # Main video processing loop
         frame_count = 0
         processed_frame_count = 0
+
         try:
             while True:
                 ret, frame = cap.read()
                 if not ret:
+                    break
+
                 frame_count += 1
+                timestamp = frame_count / fps
+
+                # Decide whether to analyze this frame based on the interval
                 if frame_count % process_interval == 0:
                     processed_frame_count += 1
+
+                    if processed_frame_count % 5 == 0:
+                        print(f"Processing frame {frame_count}/{total_frames} ({timestamp:.1f}s)")
+
                     try:
+                        # Run object detection
                         frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                         pil_image = Image.fromarray(frame_rgb)
+                        detections = detection_model.detect(pil_image)
+
+                        # Analyze the detection results
+                        class_names = detections.names if hasattr(detections, 'names') else {}
+                        self.analyze_frame_detections(detections, timestamp, class_names)
+
+                        # Draw detection annotations
+                        current_detections = self.frame_detections[-1] if self.frame_detections else []
+                        frame = self.draw_simple_annotations(frame, current_detections)
+
                     except Exception as e:
+                        print(f"Error processing frame {frame_count}: {e}")
+                        continue
+
+                # Write the processed frame to the output video
+                out.write(frame)
+
         except Exception as e:
+            print(f"Error during video processing: {e}")
             traceback.print_exc()
         finally:
             cap.release()
             out.release()
+
+        # Generate the final analysis results
+        processing_time = time.time() - start_time
+
+        # Run the statistical analyses
+        object_stats = self.generate_object_statistics(fps)
+        object_density = self.analyze_object_density(object_stats, video_duration)
+        quality_metrics = self.analyze_quality_metrics(object_stats)
+        timeline_analysis = self.generate_timeline_analysis(object_stats, video_duration)
+
+        # Compute basic statistics
+        total_unique_objects = sum(record.peak_count_in_frame for record in object_stats.values())
+
+        # Assemble the analysis results
+        analysis_results = {
+            "processing_info": {
+                "processing_time_seconds": round(processing_time, 2),
+                "total_frames": frame_count,
+                "frames_analyzed": processed_frame_count,
+                "processing_interval": process_interval,
+                "video_duration_seconds": round(video_duration, 2),
+                "fps": fps
+            },
+            "object_summary": {
+                "total_unique_objects_detected": total_unique_objects,
+                "object_types_found": len(object_stats),
+                "detailed_counts": {
+                    name: record.peak_count_in_frame
+                    for name, record in object_stats.items()
+                }
+            },
+            "timeline_analysis": timeline_analysis,
+            "analytics": {
+                "object_density": object_density,
+                "quality_metrics": quality_metrics
             }
+        }
+
+        # Make sure all dictionary keys are strings
+        analysis_results = self._ensure_string_keys(analysis_results)
+
+        # Verify the output file
+        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
+            print(f"Warning: Output video file was not created properly")
+            return None, analysis_results
+
+        print(f"Video processing completed in {processing_time:.2f} seconds")
+        print(f"Found {total_unique_objects} total objects across {len(object_stats)} categories")
+        print(f"Quality grade: {quality_metrics['quality_grade']}")
+
+        return output_path, analysis_results
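A minimal sketch of driving the processor directly; the video path and model name below are placeholders, while the result keys match the analysis_results dictionary assembled above.

# Illustrative only -- placeholder paths and model name.
from video_processor import VideoProcessor

processor = VideoProcessor()
output_path, results = processor.process_video(
    video_path="sample.mp4",          # placeholder input
    model_name="yolov8n.pt",          # placeholder model name
    confidence_threshold=0.25,
    process_interval=10,              # analyze every 10th frame
)

if output_path is None:
    print("Processing failed:", results.get("error", results))
else:
    print("Annotated video:", output_path)
    print("Objects found:", results["object_summary"]["detailed_counts"])
    print("Timeline:", results["timeline_analysis"]["timeline_summary"])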