# Source: Hugging Face Spaces - DawnC / Pixcribe (Space status: Sleeping)
# File size: 9,128 bytes - commit 6a3bd1f

from typing import List, Dict
import numpy as np

class DetectionFusionManager:
    """Integrate and prioritize detection results with intelligent lighting fusion.

    The manager merges detector output dictionaries with regions classified by
    ``clip_manager.classify_hierarchical``, ranks them by an attention score,
    and reconciles a CV-based lighting estimate with the lighting prediction
    stored under ``scene_info['lighting']``.
    """

    # Words that indicate the same kind of lighting. Two descriptions agree
    # when each contains at least one word from the same group.
    _SIMILARITY_GROUPS = (
        frozenset({'soft', 'diffused', 'overcast', 'cloudy'}),
        frozenset({'bright', 'sunny', 'sunlight', 'clear'}),
        frozenset({'warm', 'golden', 'amber', 'evening'}),
        frozenset({'natural', 'daylight', 'outdoor'}),
        frozenset({'cool', 'blue', 'twilight'}),
    )

    # Words that appear in almost every lighting description and therefore
    # carry no information about agreement (e.g. "warm ambient light" and
    # "cool blue light" both contain "light" but describe opposite moods).
    _GENERIC_LIGHTING_WORDS = frozenset({
        'light', 'lighting', 'lit', 'illumination',
        'a', 'an', 'and', 'in', 'of', 'the', 'with',
    })

    # Number of top-ranked detections that are always kept.
    MAX_DETECTIONS = 15
    # Minimum ``brand_confidence`` for a brand detection to be kept beyond
    # the MAX_DETECTIONS cut-off.
    BRAND_CONFIDENCE_THRESHOLD = 0.45

    def __init__(self, clip_manager):
        """Store the classifier used for unknown regions.

        Args:
            clip_manager: Object exposing ``classify_hierarchical(image)``,
                which must return a mapping with a ``'top_prediction'`` key
                and optionally ``'confidence'``.
        """
        self.clip_manager = clip_manager

    def fuse_lighting_analysis(self, cv_lighting: Dict, clip_scene: Dict) -> Dict:
        """Fuse a CV lighting estimate with the CLIP scene lighting prediction.

        Strategy:
            1. CV confidence above 0.85: use the CV result as-is.
            2. CV and CLIP descriptions agree semantically: keep the CV
               description and boost its confidence (capped at 0.95).
            3. Otherwise weight both by confidence; if CV holds more than
               60% of the weight keep CV at reduced confidence, else fall
               back to a generic description derived from ``cv_features``.

        Args:
            cv_lighting: Mapping with optional keys ``'lighting_type'``,
                ``'confidence'`` and ``'cv_features'``.
            clip_scene: Mapping whose optional ``'lighting'`` entry holds
                ``'top'`` and ``'confidence'``.

        Returns:
            Dict with ``'lighting_type'``, ``'confidence'`` (never above
            0.95), the two raw inputs, their confidences and the
            ``'fusion_method'`` that was applied.
        """
        cv_lighting = cv_lighting or {}
        clip_scene = clip_scene or {}

        cv_lighting_type = cv_lighting.get('lighting_type', 'soft diffused light')
        cv_confidence = cv_lighting.get('confidence', 0.7)
        # ``or {}`` also covers an explicit None stored under the key.
        cv_features = cv_lighting.get('cv_features') or {}

        # Get CLIP's lighting prediction
        clip_lighting_data = clip_scene.get('lighting') or {}
        clip_lighting_type = clip_lighting_data.get('top', 'natural light')
        clip_confidence = clip_lighting_data.get('confidence', 0.5)

        if cv_confidence > 0.85:
            # High confidence from the CV analysis: trust it outright.
            final_lighting = cv_lighting_type
            final_confidence = cv_confidence
            fusion_method = 'cv_dominant'

        elif self._lighting_semantically_similar(cv_lighting_type, clip_lighting_type):
            # Both sources agree; prefer CV's more specific wording and
            # reward the agreement with a confidence boost.
            final_lighting = cv_lighting_type
            final_confidence = min(cv_confidence * 1.15, 0.95)
            fusion_method = 'consensus'

        else:
            # Weighted fusion based on confidence. When both confidences are
            # zero there is nothing to weigh, so split evenly instead of
            # dividing by zero.
            total_confidence = cv_confidence + clip_confidence
            if total_confidence > 0:
                cv_weight = cv_confidence / total_confidence
            else:
                cv_weight = 0.5
            clip_weight = 1.0 - cv_weight

            if cv_weight > 0.6:
                final_lighting = cv_lighting_type
                final_confidence = cv_confidence * 0.9
                fusion_method = 'cv_weighted'
            else:
                # Neither source dominates: use a more generic description.
                final_lighting = self._generalize_lighting_description(
                    cv_lighting_type, clip_lighting_type, cv_features
                )
                final_confidence = (
                    cv_confidence * cv_weight + clip_confidence * clip_weight
                ) * 0.85
                fusion_method = 'generalized'

        return {
            'lighting_type': final_lighting,
            'confidence': min(final_confidence, 0.95),
            'cv_analysis': cv_lighting_type,
            'clip_prediction': clip_lighting_type,
            'fusion_method': fusion_method,
            'cv_confidence': cv_confidence,
            'clip_confidence': clip_confidence
        }

    def _lighting_semantically_similar(self, cv_type: str, clip_type: str) -> bool:
        """Check if two lighting descriptions are semantically similar.

        Descriptions are compared case-insensitively on whitespace-separated
        words. They are similar when they are identical, when both contain a
        word from the same similarity group, or when they share at least one
        word that is not a generic filler such as "light".

        Args:
            cv_type: Lighting description from the CV analysis.
            clip_type: Lighting description from CLIP.

        Returns:
            True when the descriptions are considered to agree.
        """
        cv_text = (cv_type or '').lower()
        clip_text = (clip_type or '').lower()
        cv_words = set(cv_text.split())
        clip_words = set(clip_text.split())

        # Identical non-empty descriptions always agree, even if they are
        # made up only of generic words.
        if cv_words and cv_words == clip_words:
            return True

        # Both descriptions use a word from the same semantic group.
        for group in self._SIMILARITY_GROUPS:
            if cv_words & group and clip_words & group:
                return True

        # Direct overlap counts only for informative words; otherwise nearly
        # every pair would match on the word "light".
        shared = (cv_words & clip_words) - self._GENERIC_LIGHTING_WORDS
        return bool(shared)

    def _generalize_lighting_description(self, cv_type: str, clip_type: str,
                                         cv_features: Dict) -> str:
        """Generate a generalized lighting description when CV and CLIP disagree.

        Only ``cv_features`` drives the result; ``cv_type`` and ``clip_type``
        are accepted for interface compatibility but are not consulted.

        Args:
            cv_type: CV lighting description (unused).
            clip_type: CLIP lighting description (unused).
            cv_features: Mapping with optional ``'brightness'`` (default 128),
                ``'contrast'`` (default 50) and ``'color_temp'`` (default 1.0).

        Returns:
            One of ``'soft diffused light'``, ``'warm ambient light'`` or
            ``'natural daylight'``.
        """
        cv_features = cv_features or {}
        brightness = cv_features.get('brightness', 128)
        contrast = cv_features.get('contrast', 50)
        color_temp = cv_features.get('color_temp', 1.0)

        # Normalize so the thresholds below are scale-free. Brightness is
        # divided by 255 and contrast by 100 (clamped to 1.0).
        brightness_norm = brightness / 255.0
        contrast_norm = min(contrast / 100.0, 1.0)

        # Threshold-based decision tree on the measured features.
        if contrast_norm < 0.5:
            # Low contrast: choose by colour temperature.
            if color_temp < 1.0:
                return 'soft diffused light'
            return 'warm ambient light'
        if brightness_norm > 0.7:
            # High brightness
            return 'natural daylight'
        if color_temp > 1.1:
            # Warm temperature
            return 'warm ambient light'
        # Default safe description
        return 'soft diffused light'

    def analyze_composition(self, image, detections: List[Dict]) -> Dict:
        """Analyze image composition from the shapes of detected boxes.

        A detection counts as vertical when its box is taller than it is
        wide, reading ``bbox`` as ``[x1, y1, x2, y2]``.

        Args:
            image: Currently unused; kept for interface compatibility.
            detections: Detection dicts, each with a 4-element ``'bbox'``.

        Returns:
            Dict with ``'composition_type'``, ``'vertical_ratio'``,
            ``'vertical_objects_count'`` and ``'total_objects'``. Empty input
            yields ``'composition_type': 'empty'`` with zeroed numbers.
        """
        if not detections:
            # Same keys as the non-empty result so callers can index safely.
            return {
                'composition_type': 'empty',
                'vertical_ratio': 0.0,
                'vertical_objects_count': 0,
                'total_objects': 0
            }

        # Calculate vertical element ratio
        vertical_objects = [
            d for d in detections
            if (d['bbox'][3] - d['bbox'][1]) > (d['bbox'][2] - d['bbox'][0])
        ]
        vertical_ratio = len(vertical_objects) / len(detections)

        # Determine composition type
        if vertical_ratio > 0.6:
            composition_type = 'urban canyon'
        elif vertical_ratio > 0.4:
            composition_type = 'vertical emphasis'
        else:
            composition_type = 'standard street view'

        return {
            'composition_type': composition_type,
            'vertical_ratio': vertical_ratio,
            'vertical_objects_count': len(vertical_objects),
            'total_objects': len(detections)
        }

    def fuse_detections(self, yolo_results: List[Dict], unknown_regions: List[Dict],
                       scene_info: Dict, image=None, cv_lighting: Dict = None) -> Dict:
        """Fuse all detection results with intelligent lighting fusion.

        Side effects: every dict in ``yolo_results`` gains an
        ``'attention_score'`` key, and ``scene_info['lighting']`` is
        overwritten when ``cv_lighting`` is provided.

        Args:
            yolo_results: Detection dicts with a 4-element ``'bbox'`` and
                optionally ``'confidence'``.
            unknown_regions: Region dicts; those without an ``'image'`` key
                are skipped, the rest are classified through
                ``clip_manager`` and must carry ``'bbox'``.
            scene_info: Scene description dict; ``None`` is treated as empty.
            image: Source image. Composition is analyzed only when it is not
                ``None``.
            cv_lighting: Optional CV lighting analysis to fuse with
                ``scene_info``; skipped when ``None`` or empty.

        Returns:
            Dict with ``'detections'`` (ranked and filtered),
            ``'scene_info'``, ``'composition'`` and ``'total_objects'``
            (count before filtering).
        """
        if scene_info is None:
            scene_info = {}

        all_detections = []

        # Process YOLO detections with attention scores
        for det in yolo_results:
            det['attention_score'] = self._calculate_attention_score(det)
            all_detections.append(det)

        # Classify unknown regions using OpenCLIP
        for region in unknown_regions:
            if 'image' not in region:
                continue

            classification = self.clip_manager.classify_hierarchical(region['image'])

            all_detections.append({
                'class_name': classification['top_prediction'],
                'bbox': region['bbox'],
                'confidence': classification.get('confidence', 0.5),
                'attention_score': region.get('saliency_score', 0.5),
                'source': 'openclip'
            })

        # Sort by attention score, highest first
        ranked_detections = sorted(
            all_detections,
            key=lambda x: x['attention_score'],
            reverse=True
        )

        # Keep the top MAX_DETECTIONS, then extend with the brand detections
        # that immediately follow in rank order.
        # NOTE(review): the scan stops at the first overflow item that is not
        # a confident brand, so brand detections ranked after it are dropped.
        # Confirm whether that is intended before changing it.
        filtered = ranked_detections[:self.MAX_DETECTIONS]
        for det in ranked_detections[self.MAX_DETECTIONS:]:
            is_confident_brand = (
                det.get('brand')
                and det.get('brand_confidence', 0) > self.BRAND_CONFIDENCE_THRESHOLD
            )
            if not is_confident_brand:
                break
            filtered.append(det)

        # Analyze composition. Compare against None explicitly: the truth
        # value of a NumPy array is ambiguous and raises ValueError.
        if image is not None:
            composition = self.analyze_composition(image, filtered)
        else:
            composition = {}

        # Intelligent lighting fusion
        if cv_lighting:
            fused_lighting = self.fuse_lighting_analysis(cv_lighting, scene_info)
            # Update scene_info with fused lighting
            scene_info['lighting'] = {
                'top': fused_lighting['lighting_type'],
                'confidence': fused_lighting['confidence'],
                'fusion_details': fused_lighting
            }

        return {
            'detections': filtered,
            'scene_info': scene_info,
            'composition': composition,
            'total_objects': len(all_detections)
        }

    def _calculate_attention_score(self, detection: Dict) -> float:
        """Calculate attention score based on position, size, and confidence.

        The score is ``0.3 * position + 0.3 * size + 0.4 * confidence``.

        Boxes with ``x2 > 100`` are handled as pixel coordinates: position
        gets a neutral 0.5 and the area is divided by 1000 * 1000. All other
        boxes are scored against a centre of (0.5, 0.5), i.e. as if the
        coordinates were normalized.
        NOTE(review): the ``x2 > 100`` test and the 1000 * 1000 divisor are
        heuristics; the real image size is not available here.

        Args:
            detection: Dict with ``'bbox'`` as ``[x1, y1, x2, y2]`` and
                optionally ``'confidence'`` (default 0.5).

        Returns:
            The weighted attention score.
        """
        x1, y1, x2, y2 = detection['bbox']
        is_pixel_bbox = x2 > 100

        area = abs((x2 - x1) * (y2 - y1))
        if is_pixel_bbox:
            position_score = 0.5
            area = area / (1000 * 1000)
        else:
            center_x = (x1 + x2) / 2
            center_y = (y1 + y2) / 2
            # Distance from the centre (0.5, 0.5) lowers the score.
            position_score = 1.0 - (abs(center_x - 0.5) + abs(center_y - 0.5))

        # The size contribution saturates at 0.5.
        size_score = min(area, 0.5)
        conf_score = detection.get('confidence', 0.5)

        return (
            0.3 * position_score +
            0.3 * size_score +
            0.4 * conf_score
        )

# Runs at module top level, so this confirmation is printed each time the
# file is executed or first imported.
print("✓ DetectionFusionManager (V2 with intelligent fusion) defined")