import sys
import time
import traceback
from PIL import Image
from typing import Dict, List, Callable, Optional

from image_processor_manager import ImageProcessorManager
from yolo_detection_manager import YOLODetectionManager
from saliency_detection_manager import SaliencyDetectionManager
from openclip_semantic_manager import OpenCLIPSemanticManager
from lighting_analysis_manager import LightingAnalysisManager
from ocr_engine_manager import OCREngineManager
from prompt_library_manager import PromptLibraryManager
from brand_recognition_manager import BrandRecognitionManager
from brand_visualization_manager import BrandVisualizationManager
from brand_verification_manager import BrandVerificationManager
from scene_compatibility_manager import SceneCompatibilityManager
from caption_generation_manager import CaptionGenerationManager
from detection_fusion_manager import DetectionFusionManager
from output_processing_manager import OutputProcessingManager
from batch_processing_manager import BatchProcessingManager

class PixcribePipeline:
    """Main Facade coordinating all components (V2 with multi-language support)"""

    def __init__(self, yolo_variant='l', vlm_model_name='Qwen/Qwen2.5-VL-7B-Instruct'):
        """
        Args:
            yolo_variant: 'm', 'l' (default), or 'x'
            vlm_model_name: Vision-Language Model name (default: Qwen2.5-VL-7B-Instruct)
                Can be changed to 'Qwen/Qwen3-VL-8B-Instruct' for latest model
        """
        print("="*60)
        print("Initializing Pixcribe Pipeline V2...")
        print("="*60)

        start_time = time.time()

        # Initialize all managers
        self.image_processor = ImageProcessorManager()
        self.yolo_detector = YOLODetectionManager(variant=yolo_variant)
        self.saliency_detector = SaliencyDetectionManager()
        self.clip_semantic = OpenCLIPSemanticManager()
        self.lighting_analyzer = LightingAnalysisManager()
        self.ocr_engine = OCREngineManager()

        # NEW: Initialize PromptLibrary (centralized prompt management)
        self.prompt_library = PromptLibraryManager()

        # Initialize BrandRecognitionManager with PromptLibrary
        self.brand_recognizer = BrandRecognitionManager(
            self.clip_semantic, self.ocr_engine, self.prompt_library
        )

        # NEW: Brand visualization manager
        self.brand_visualizer = BrandVisualizationManager()

        self.caption_generator = CaptionGenerationManager(model_name=vlm_model_name)

        # NEW: Brand verification with VLM
        self.brand_verifier = BrandVerificationManager(self.caption_generator)

        # NEW: Scene compatibility checker
        self.scene_compatibility = SceneCompatibilityManager(self.prompt_library)

        self.fusion_manager = DetectionFusionManager(self.clip_semantic)

        # Initialize OutputProcessingManager with PromptLibrary for smart hashtag generation
        self.output_processor = OutputProcessingManager(self.prompt_library)

        # Initialize BatchProcessingManager with pipeline reference
        self.batch_processor = BatchProcessingManager(pipeline=self)

        elapsed = time.time() - start_time
        print("="*60)
        print(f"✓ Pipeline V5 initialized successfully with batch processing (Time: {elapsed:.2f}s)")
        print("="*60)

    def process_image(self, image, platform='instagram', yolo_variant='l', language='zh') -> Dict:
        """End-to-end image processing pipeline

        Args:
            image: PIL Image or path
            platform: 'instagram', 'tiktok', or 'xiaohongshu'
            yolo_variant: 'm', 'l' (default), or 'x' (kept for API compatibility; the detector loaded at initialization is reused)
            language: 'zh' (Traditional Chinese), 'en' (English), 'zh-en' (Bilingual)

        Returns:
            Processing results dictionary with brand visualizations
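
        Example (illustrative sketch; 'photo.jpg' is a hypothetical path):
            >>> result = pipeline.process_image('photo.jpg', platform='instagram', language='en')
            >>> print(f"{len(result['captions'])} captions in {result['processing_time']:.1f}s")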
        """
        print(f"\nProcessing image (Platform: {platform}, Language: {language})...")
        start_time = time.time()

        try:
            # Step 1: Preprocessing
            print("[1/9] Preprocessing image...")
            processed_img = self.image_processor.load_image(image)
            yolo_input = self.image_processor.preprocess_for_yolo(processed_img)

            # Step 2: Parallel detection
            print("[2/9] YOLO object detection...")
            yolo_results = self.yolo_detector.detect(yolo_input)
            print(f"      Detected {len(yolo_results)} objects")

            print("[3/9] Saliency detection...")
            salient_regions = self.saliency_detector.detect_salient_regions(processed_img)
            print(f"      Found {len(salient_regions)} salient regions")

            # Step 3: Identify unknown objects
            print("[4/9] Identifying unknown objects...")
            unknown_regions = self.saliency_detector.extract_unknown_regions(
                salient_regions, yolo_results
            )
            print(f"      Found {len(unknown_regions)} unknown regions")

            # Step 4: Brand recognition (with bounding boxes)
            print("[5/9] Brand recognition...")
            brands = []
            brand_detections = []  # For visualization

            # Method 1: Check YOLO-detected brand-relevant objects
            brand_relevant = self.yolo_detector.filter_brand_relevant_objects(yolo_results)
            if brand_relevant:
                print(f"      Checking {len(brand_relevant)} YOLO brand-relevant objects...")
                for det in brand_relevant[:5]:  # Check top 5 brand-relevant objects
                    region = processed_img.crop(det['bbox'])
                    brand_result = self.brand_recognizer.recognize_brand(
                        region, processed_img, region_bbox=det['bbox']
                    )

                    if brand_result:
                        for brand_name, confidence, bbox in brand_result[:2]:  # Top 2 brands per region
                            brands.append((brand_name, confidence))

                            # Prepare for visualization
                            brand_info = self.prompt_library.get_brand_prompts(brand_name)
                            category = brand_info.get('category', 'default') if brand_info else 'default'

                            brand_detections.append({
                                'name': brand_name,
                                'confidence': confidence,
                                'bbox': bbox,
                                'category': category
                            })

            # Method 2: Full-image brand scan (essential for commercial-grade use)
            # Run the full-image brand scan regardless of whether YOLO detected relevant objects
            print("      Performing intelligent full-image brand scan...")
            full_image_brands = self.brand_recognizer.scan_full_image_for_brands(
                processed_img,
                exclude_bboxes=[bd['bbox'] for bd in brand_detections if bd.get('bbox')],
                saliency_regions=salient_regions  # pass saliency regions so scan areas are chosen intelligently
            )

            # Merge full-image scan results
            if full_image_brands:
                print(f"      Full-image scan found {len(full_image_brands)} additional brands")
                for brand_name, confidence, bbox in full_image_brands:
                    # Skip brands that were already detected to avoid duplicates
                    if not any(bd['name'] == brand_name for bd in brand_detections):
                        brands.append((brand_name, confidence))

                        brand_info = self.prompt_library.get_brand_prompts(brand_name)
                        category = brand_info.get('category', 'default') if brand_info else 'default'

                        brand_detections.append({
                            'name': brand_name,
                            'confidence': confidence,
                            'bbox': bbox,
                            'category': category
                        })

            print(f"      Identified {len(brands)} brand instances (before verification)")

            # Step 4.5: CLIP scene understanding (moved earlier for compatibility check)
            print("[5.5/11] Scene understanding (CLIP)...")
            scene_analysis = self.clip_semantic.analyze_scene(processed_img)
            print(f"      Scene: {scene_analysis.get('urban', {}).get('top', 'unknown')}")

            # Step 4.6: Scene compatibility check
            if brands:
                print("[5.6/11] Checking scene compatibility...")
                brands_with_bbox = [(b[0], b[1], brand_detections[i]['bbox'])
                                    for i, b in enumerate(brands)]
                compatible_brands = self.scene_compatibility.batch_check_compatibility(
                    brands_with_bbox, scene_analysis
                )
                print(f"      {len(compatible_brands)} brands passed compatibility check")

                # Update brands and brand_detections
                if compatible_brands:
                    brands = [(b[0], b[1]) for b in compatible_brands]
                    brand_detections = []
                    for brand_name, confidence, bbox in compatible_brands:
                        brand_info = self.prompt_library.get_brand_prompts(brand_name)
                        category = brand_info.get('category', 'default') if brand_info else 'default'
                        brand_detections.append({
                            'name': brand_name,
                            'confidence': confidence,
                            'bbox': bbox,
                            'category': category
                        })
                else:
                    brands = []
                    brand_detections = []

            # Step 4.7: VLM brand verification
            if brand_detections:
                print("[5.7/11] VLM brand verification...")
                vlm_verification = self.brand_verifier.verify_brands(
                    processed_img, [(bd['name'], bd['confidence'], bd['bbox'])
                                   for bd in brand_detections]
                )
                print(f"      VLM verified {len(vlm_verification.get('verified_brands', []))} brands")

                # Three-way voting: OpenCLIP + OCR + VLM
                # Collect OCR matches for voting
                ocr_brands = {}
                for brand_name, conf in brands:
                    if brand_name not in ocr_brands:
                        ocr_brands[brand_name] = (0.5, conf)  # Approximate text/ocr split

                final_brands = self.brand_verifier.three_way_voting(
                    [(bd['name'], bd['confidence'], bd['bbox']) for bd in brand_detections],
                    ocr_brands,
                    vlm_verification
                )
                print(f"      Final verified brands: {len(final_brands)}")

                # Update brands and brand_detections with verified results
                if final_brands:
                    brands = [(b[0], b[1]) for b in final_brands]
                    brand_detections = []
                    for brand_name, confidence, bbox in final_brands:
                        brand_info = self.prompt_library.get_brand_prompts(brand_name)
                        category = brand_info.get('category', 'default') if brand_info else 'default'
                        brand_detections.append({
                            'name': brand_name,
                            'confidence': confidence,
                            'bbox': bbox,
                            'category': category
                        })
                else:
                    brands = []
                    brand_detections = []

            # Step 5: Visualize brand detections on the image (NEW)
            if brand_detections:
                visualized_image = self.brand_visualizer.draw_brand_detections(
                    processed_img.copy(), brand_detections
                )
            else:
                visualized_image = processed_img

            # Step 6: CV-based lighting analysis
            print("[7/11] Analyzing lighting conditions...")
            cv_lighting = self.lighting_analyzer.analyze_lighting(processed_img)
            print(f"      CV Lighting: {cv_lighting['lighting_type']} (confidence: {cv_lighting['confidence']:.2f})")
            print(f"      Details: brightness={cv_lighting['cv_features']['brightness']:.1f}, "
                  f"temp_ratio={cv_lighting['cv_features']['color_temp']:.2f}, "
                  f"contrast={cv_lighting['cv_features']['contrast']:.1f}")

            # Step 7: Additional scene analysis details
            print("[8/11] Additional scene analysis...")
            print(f"      CLIP Lighting: {scene_analysis.get('lighting', {}).get('top', 'unknown')}")
            print(f"      Mood: {scene_analysis.get('mood', {}).get('top', 'unknown')}")

            # Step 8: Fusion with lighting analysis
            print("[9/11] Fusing detection results...")
            fused_results = self.fusion_manager.fuse_detections(
                yolo_results, unknown_regions, scene_analysis, processed_img, cv_lighting
            )
            fused_results['brands'] = brands
            fused_results['scene_analysis'] = scene_analysis

            # Print fused lighting result
            fused_lighting = fused_results['scene_analysis']['lighting']['top']
            print(f"      Fused Lighting: {fused_lighting}")

            # Step 9: Caption generation with language support
            print("[10/11] Generating captions...")
            captions = self.caption_generator.generate_captions(
                fused_results, processed_img, platform, language
            )

            # Step 10: Output processing with smart hashtags
            print("[11/11] Output processing...")
            validated_captions = []
            for caption in captions:
                # Only generate hashtags if VLM didn't generate any
                # DO NOT override VLM hashtags - they follow language requirements
                if not caption.get('hashtags') or len(caption.get('hashtags', [])) < 3:
                    print(f"  [DEBUG] Caption has {len(caption.get('hashtags', []))} hashtags, generating smart hashtags...")
                    caption['hashtags'] = self.output_processor.generate_smart_hashtags(
                        fused_results['detections'],
                        scene_analysis,
                        brands,
                        platform,
                        language
                    )
                else:
                    print(f"  [DEBUG] Caption has {len(caption['hashtags'])} VLM-generated hashtags")

                # Pass the full context to validate_output so it can auto-supplement hashtags
                is_valid, msg = self.output_processor.validate_output(
                    caption, platform,
                    detections=fused_results['detections'],
                    scene_info=scene_analysis,
                    brands=brands,
                    language=language
                )
                if is_valid:
                    validated_captions.append(caption)
                else:
                    print(f"  [DEBUG] Caption validation failed: {msg}")

            elapsed = time.time() - start_time
            print(f"\n✓ Processing complete (Total time: {elapsed:.2f}s)")
            print(f"  Generated {len(validated_captions)} caption variations")

            return {
                'captions': validated_captions,
                'detections': fused_results['detections'],
                'brands': brands,
                'brand_detections': brand_detections,  # NEW: For UI display
                'visualized_image': visualized_image,  # NEW: Image with brand boxes
                'scene': scene_analysis,
                'composition': fused_results.get('composition', {}),
                'lighting': cv_lighting,
                'processing_time': elapsed
            }

        except Exception as e:
            print(f"\n✗ Processing error: {str(e)}")
            traceback.print_exc()
            # Re-raise exception so it can be caught and displayed
            raise

    def process_batch(
        self,
        images: List[Image.Image],
        platform: str = 'instagram',
        yolo_variant: str = 'l',
        language: str = 'zh',
        progress_callback: Optional[Callable] = None
    ) -> Dict:
        """
        Process multiple images in batch with progress tracking.

        This method provides a Facade interface to the BatchProcessingManager,
        allowing batch processing through the main Pipeline API.

        Args:
            images: List of PIL Image objects to process (max 10)
            platform: Target social media platform ('instagram', 'tiktok', 'xiaohongshu')
            yolo_variant: YOLO model variant ('m', 'l', 'x')
            language: Caption language ('zh' for Traditional Chinese, 'en' for English)
            progress_callback: Optional callback function for progress updates

        Returns:
            Dictionary containing:
                - results: Dict mapping image index to processing results
                - total_processed: Total number of images processed
                - total_success: Number of successfully processed images
                - total_failed: Number of failed images
                - total_time: Total processing time in seconds
                - average_time_per_image: Average time per image in seconds

        Raises:
            ValueError: If images list is empty or exceeds 10 images

        Example:
            >>> images = [Image.open(f'image{i}.jpg') for i in range(1, 6)]
            >>> results = pipeline.process_batch(images, platform='instagram')
            >>> print(f"Processed {results['total_success']}/{results['total_processed']} images")
        """
        return self.batch_processor.process_batch(
            images=images,
            platform=platform,
            yolo_variant=yolo_variant,
            language=language,
            progress_callback=progress_callback
        )

print("✓ PixcribePipeline V5 (with Batch Processing) defined")