Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import uuid | |
| from typing import List, Dict, Any, Optional, Tuple | |
| import sys | |
| import subprocess | |
# ==========================
# Install sentencepiece for the Helsinki-NLP translation models
# ==========================
try:
    import sentencepiece  # noqa
except ImportError:
    # Best-effort runtime install: with check=False a failing pip run does
    # not raise here.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "sentencepiece"],
        check=False,
    )
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| from PIL import Image, ImageDraw, ImageFont | |
| from scipy.io.wavfile import write as wav_write | |
| from transformers import ( | |
| pipeline, | |
| BlipForImageTextRetrieval, | |
| AutoProcessor, | |
| ) | |
| from transformers.utils import logging as hf_logging | |
# Restrict transformers logging to error-level messages.
hf_logging.set_verbosity_error()
| # ========================== | |
| # Вспомогательные функции | |
| # ========================== | |
| def _ensure_rgb(img: Image.Image) -> Image.Image: | |
| if img.mode != "RGB": | |
| return img.convert("RGB") | |
| return img | |
def load_font(size: int, bold: bool = False) -> ImageFont.FreeTypeFont | ImageFont.ImageFont:
    """Load a DejaVu Sans font at ``size``, falling back to Pillow's default.

    When ``bold`` is true the bold face is tried first, then the regular
    face. Any failure to load a candidate moves on to the next one; if none
    loads, ``ImageFont.load_default()`` is returned.
    """
    candidates = ["/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"]
    if bold:
        candidates.insert(0, "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf")

    for font_path in candidates:
        try:
            loaded = ImageFont.truetype(font_path, size=size)
        except Exception:
            continue
        return loaded
    return ImageFont.load_default()
def wrap_text(text: str, font: ImageFont.ImageFont, max_width: int, draw: ImageDraw.ImageDraw) -> list[str]:
    """Greedily wrap ``text`` into lines no wider than ``max_width`` pixels.

    Width is measured with ``draw.textbbox``. A single word that is wider
    than ``max_width`` is kept on its own line rather than being split.
    """
    wrapped = []
    pending = ""
    for word in text.split():
        candidate = f"{pending} {word}" if pending else word
        box = draw.textbbox((0, 0), candidate, font=font)
        too_wide = box[2] - box[0] > max_width
        if pending and too_wide:
            # Close the current line and start a new one with this word.
            wrapped.append(pending)
            pending = word
        else:
            pending = candidate
    if pending:
        wrapped.append(pending)
    return wrapped
def create_vertical_gradient(size: tuple[int, int], top_color: tuple[int, int, int],
                             bottom_color: tuple[int, int, int]) -> Image.Image:
    """Build an RGB image of ``size`` fading from ``top_color`` to ``bottom_color``.

    A one-pixel-wide column is filled by linear interpolation between the
    two colours and then resized to the requested width.
    """
    width, height = size
    column = Image.new("RGB", (1, height))
    last_row = max(height - 1, 1)  # avoids division by zero for height <= 1
    for row in range(height):
        t = row / last_row
        pixel = tuple(
            int(top * (1 - t) + bottom * t)
            for top, bottom in zip(top_color, bottom_color)
        )
        column.putpixel((0, row), pixel)
    return column.resize((width, height))
def render_results_in_image(
    pil_image: Image.Image,
    detections: List[Dict[str, Any]],
    score_threshold: float = 0.5,
) -> Image.Image:
    """Draw detection boxes and ``label score`` tags on a copy of ``pil_image``.

    Detections whose ``score`` is below ``score_threshold`` are skipped.
    Box coordinates are read from ``det["box"]`` (``xmin``/``ymin``/``xmax``/
    ``ymax``) and clamped to the image bounds. The input image is not
    modified.
    """
    canvas = _ensure_rgb(pil_image).copy()
    painter = ImageDraw.Draw(canvas)
    width, height = canvas.size
    label_font = ImageFont.load_default()
    green = (0, 255, 0)
    margin = 2

    def clamp(value, upper):
        return max(0, min(upper, value))

    for detection in detections:
        confidence = float(detection.get("score", 0.0))
        if confidence < score_threshold:
            continue

        name = str(detection.get("label", ""))
        box = detection.get("box", {})
        left = clamp(box.get("xmin", 0), width)
        right = clamp(box.get("xmax", 0), width)
        top = clamp(box.get("ymin", 0), height)
        bottom = clamp(box.get("ymax", 0), height)
        painter.rectangle([(left, top), (right, bottom)], outline=green, width=3)

        # Filled tag in the box's top-left corner carrying label and score.
        caption = f"{name} {confidence:.2f}"
        text_box = painter.textbbox((0, 0), caption, font=label_font)
        text_w = text_box[2] - text_box[0]
        text_h = text_box[3] - text_box[1]
        tag_right = min(left + text_w + 2 * margin, width)
        tag_bottom = min(top + text_h + 2 * margin, height)
        painter.rectangle([(left, top), (tag_right, tag_bottom)], fill=green)
        painter.text((left + margin, top + margin), caption, fill=(0, 0, 0), font=label_font)
    return canvas
def seg_score(seg: Dict[str, Any]) -> float:
    """Return ``seg["score"]`` as a float, or 0.0 when it is missing or not numeric."""
    raw = seg.get("score", 0.0)
    try:
        value = float(raw)
    except (TypeError, ValueError):
        value = 0.0
    return value
def show_masks_on_image(pil_image: Image.Image, masks: list[np.ndarray]) -> Image.Image:
    """Overlay each mask on ``pil_image`` as a semi-transparent red tint.

    Each mask is multiplied by 255, cast to uint8, resized to the image size
    and used as the stencil for the tint. Returns a new RGBA image.
    """
    base = pil_image.convert("RGBA")
    tint_layer = Image.new("RGBA", base.size)
    red_fill = Image.new("RGBA", base.size, (255, 0, 0, 100))
    for mask in masks:
        stencil = (
            Image.fromarray((mask * 255).astype("uint8"))
            .resize(base.size)
            .convert("L")
        )
        tint_layer = Image.composite(red_fill, tint_layer, stencil)
    return Image.alpha_composite(base, tint_layer)
| # ========================== | |
| # Модели и пайплайны | |
| # ========================== | |
# Object detector; used by get_main_detection_box() and od_predict().
od_pipe = pipeline("object-detection", model="hustvl/yolos-tiny")
# Semantic segmentation; used by sam_predict().
segmentation_pipe = pipeline(
    task="image-segmentation",
    model="nvidia/segformer-b4-finetuned-ade-512-512",
)
# BLIP image-text matching model and its processor; used by retrieval_predict().
retrieval_model_name = "Salesforce/blip-itm-base-coco"
retrieval_model = BlipForImageTextRetrieval.from_pretrained(retrieval_model_name)
retrieval_processor = AutoProcessor.from_pretrained(retrieval_model_name)
# Image captioning; used by caption_predict_ru().
caption_pipe = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
)
# Speech recognition; transcribe_audio() picks one of these by language.
asr_en = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-tiny.en",
)
asr_ru = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-tiny",
)
# Text-to-speech; used by tts_predict(). Both languages use the same
# "suno/bark-small" checkpoint, so the second name is an alias of the first
# instead of a second copy of the model in memory.
tts_en = pipeline(
    task="text-to-speech",
    model="suno/bark-small",
)
tts_ru = tts_en
# Sentiment classifiers; used by nlp_predict().
nlp_sentiment_en = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
nlp_sentiment_ru = pipeline(
    task="sentiment-analysis",
    model="blanchefort/rubert-base-cased-sentiment",
)
# English summarizer; used by nlp_predict() and step2_generate_description().
nlp_summarizer_en = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)
# Translators; wrapped by translate_en_ru() and translate_ru_en().
translator_en_ru = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-en-ru",
)
translator_ru_en = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-ru-en",
)
| # ========================== | |
| # Детекция / сегментация | |
| # ========================== | |
def get_main_detection_box(pil_image: Image.Image) -> Dict[str, float]:
    """Return the box of the largest-area detection from ``od_pipe``.

    When the detector returns nothing, a box covering the whole image is
    returned instead.
    """
    width, height = pil_image.size
    detections = od_pipe(pil_image)
    if not detections:
        return {"xmin": 0, "ymin": 0, "xmax": width, "ymax": height}

    def _box_area(detection: Dict[str, Any]) -> float:
        box = detection["box"]
        return float((box["xmax"] - box["xmin"]) * (box["ymax"] - box["ymin"]))

    return max(detections, key=_box_area)["box"]
def compute_iou(box1: Dict[str, float], box2: Dict[str, float]) -> float:
    """Return intersection-over-union of two ``xmin/ymin/xmax/ymax`` boxes.

    Returns 0.0 when the boxes do not overlap or the union is not positive.
    """

    def _area(box: Dict[str, float]) -> float:
        return (box["xmax"] - box["xmin"]) * (box["ymax"] - box["ymin"])

    overlap_w = max(0.0, min(box1["xmax"], box2["xmax"]) - max(box1["xmin"], box2["xmin"]))
    overlap_h = max(0.0, min(box1["ymax"], box2["ymax"]) - max(box1["ymin"], box2["ymin"]))
    intersection = overlap_w * overlap_h
    if intersection == 0:
        return 0.0

    union = _area(box1) + _area(box2) - intersection
    return intersection / union if union > 0 else 0.0
def choose_product_mask(
    segments: List[Dict[str, Any]],
    image_size: Tuple[int, int],
    det_box: Dict[str, float],
) -> Optional[np.ndarray]:
    """Pick the segmentation mask that best matches the detection box.

    A segment is rejected when its mask covers less than 0.5% or more than
    90% of the image, when its bounding box has IoU below 0.1 with
    ``det_box``, or when its bounding-box area is outside 0.25x-1.5x of the
    detection area. Survivors are ranked by ``seg_score + 2 * IoU``.

    Returns the winning boolean mask (sized to ``image_size``), or ``None``
    when no segment qualifies.
    """
    width, height = image_size
    det_w = det_box["xmax"] - det_box["xmin"]
    det_h = det_box["ymax"] - det_box["ymin"]
    det_area = max(det_w * det_h, 1.0)
    total_pixels = float(width * height)

    winner = None
    winner_quality = -1.0
    for segment in segments:
        binary = np.array(segment["mask"]) > 0
        rows, cols = binary.shape
        if (cols, rows) != (width, height):
            # Bring the mask to the image resolution before measuring it.
            resized = Image.fromarray(binary.astype("uint8")).resize((width, height))
            binary = np.array(resized) > 0

        fraction = binary.sum() / total_pixels
        if fraction < 0.005 or fraction > 0.90:
            continue

        ys, xs = np.where(binary)
        if xs.size == 0 or ys.size == 0:
            continue

        mask_box = {
            "xmin": float(xs.min()),
            "xmax": float(xs.max()),
            "ymin": float(ys.min()),
            "ymax": float(ys.max()),
        }
        overlap = compute_iou(det_box, mask_box)
        if overlap < 0.1:
            continue

        mask_box_area = (mask_box["xmax"] - mask_box["xmin"]) * (mask_box["ymax"] - mask_box["ymin"])
        coverage = mask_box_area / det_area
        if coverage < 0.25 or coverage > 1.5:
            continue

        quality = seg_score(segment) + 2.0 * overlap
        if quality > winner_quality:
            winner_quality = quality
            winner = binary
    return winner
| # ========================== | |
| # Перевод | |
| # ========================== | |
def translate_en_ru(text: str) -> str:
    """Translate English ``text`` to Russian; an empty input yields ``""``."""
    if not text:
        return ""
    outputs = translator_en_ru(text, max_length=512)
    return outputs[0]["translation_text"]
def translate_ru_en(text: str) -> str:
    """Translate Russian ``text`` to English; an empty input yields ``""``."""
    if not text:
        return ""
    outputs = translator_ru_en(text, max_length=512)
    return outputs[0]["translation_text"]
| # ========================== | |
| # Инференс | |
| # ========================== | |
def od_predict(pil_image: Image.Image, score_threshold: float) -> Image.Image:
    """Draw the largest detected object on the image, labelled "товар".

    ``score_threshold`` is accepted but not applied: the largest-area
    detection is always rendered (with a threshold of 0.0). When nothing is
    detected the input image is returned unchanged.

    Raises:
        gr.Error: if the detection pipeline or the image is missing.
    """
    if od_pipe is None:
        raise gr.Error("Пайплайн для детекции объектов не инициализирован.")
    if pil_image is None:
        raise gr.Error("Пожалуйста, загрузите изображение.")

    detections = od_pipe(pil_image)
    if not detections:
        return pil_image

    def _box_area(detection: Dict[str, Any]) -> float:
        box = detection["box"]
        return float((box["xmax"] - box["xmin"]) * (box["ymax"] - box["ymin"]))

    # Copy the detection so the pipeline's own result is not modified.
    main_object = {**max(detections, key=_box_area), "label": "товар"}
    return render_results_in_image(pil_image, [main_object], score_threshold=0.0)
def sam_predict(pil_image: Image.Image) -> Image.Image:
    """Return the image with the product mask overlaid.

    The mask is chosen from the segmentation output by
    ``choose_product_mask``; when none qualifies, a rectangular mask over
    the main detection box is used instead.

    Raises:
        gr.Error: if the segmentation pipeline or the image is missing.
    """
    if segmentation_pipe is None:
        raise gr.Error("Пайплайн сегментации не инициализирован.")
    if pil_image is None:
        raise gr.Error("Пожалуйста, загрузите изображение.")

    width, height = pil_image.size
    det_box = get_main_detection_box(pil_image)
    segments = segmentation_pipe(pil_image)

    chosen = choose_product_mask(segments, pil_image.size, det_box) if segments else None
    if chosen is None:
        # Fallback: rectangular mask over the product's detection box.
        left = int(max(0, min(width - 1, det_box["xmin"])))
        right = int(max(0, min(width, det_box["xmax"])))
        top = int(max(0, min(height - 1, det_box["ymin"])))
        bottom = int(max(0, min(height, det_box["ymax"])))
        chosen = np.zeros((height, width), dtype=bool)
        chosen[top:bottom, left:right] = True

    return show_masks_on_image(pil_image, [chosen])
def cutout_product(pil_image: Image.Image) -> Image.Image:
    """Cut the product out of the photo for use on the card.

    Steps:
      1) Find the product box with the object detector.
      2) Crop it with a small margin.
      3) Take the mean colour of the crop's four corners as the background.
      4) Make every pixel close to that colour transparent; everything
         else is treated as the product.

    This approach works well for catalogue photos on a light background.

    Returns:
        An RGBA crop whose alpha channel is 0 on background pixels and 255
        elsewhere.
    """
    pil_image = _ensure_rgb(pil_image)
    W, H = pil_image.size
    det_box = get_main_detection_box(pil_image)
    x1 = int(max(0, min(W - 1, det_box["xmin"])))
    x2 = int(max(0, min(W, det_box["xmax"])))
    y1 = int(max(0, min(H - 1, det_box["ymin"])))
    y2 = int(max(0, min(H, det_box["ymax"])))
    if x2 <= x1 or y2 <= y1:
        # A degenerate box would produce an empty crop and make the corner
        # sampling below fail; use the whole image instead.
        x1, y1, x2, y2 = 0, 0, W, H

    # Small margin around the product.
    pad_x = int(0.05 * (x2 - x1))
    pad_y = int(0.05 * (y2 - y1))
    x1 = max(0, x1 - pad_x)
    x2 = min(W, x2 + pad_x)
    y1 = max(0, y1 - pad_y)
    y2 = min(H, y2 + pad_y)

    crop = pil_image.crop((x1, y1, x2, y2)).convert("RGBA")
    arr = np.array(crop).astype(np.uint8)
    h, w, _ = arr.shape

    # Background = mean colour of the four corners.
    corners = np.array(
        [
            arr[0, 0],
            arr[0, w - 1],
            arr[h - 1, 0],
            arr[h - 1, w - 1],
        ],
        dtype=np.float32,
    )
    bg = corners.mean(axis=0)  # [R, G, B, A]; only RGB is used below
    diff = np.linalg.norm(arr[..., :3].astype(np.float32) - bg[:3], axis=-1)
    bg_brightness = bg[:3].mean()
    # A very light background gets the tighter threshold, otherwise a
    # slightly looser one.
    thr = 25.0 if bg_brightness > 220 else 40.0
    mask_bg = diff < thr
    alpha = np.where(mask_bg, 0, 255).astype(np.uint8)
    arr[..., 3] = alpha
    return Image.fromarray(arr)
def caption_predict_ru(pil_image: Image.Image) -> str:
    """Caption the image with ``caption_pipe`` and return the Russian translation.

    Raises:
        gr.Error: if no image is supplied.
    """
    if pil_image is None:
        raise gr.Error("Пожалуйста, загрузите изображение.")
    generated = caption_pipe(pil_image, max_new_tokens=50)
    return translate_en_ru(generated[0]["generated_text"])
def retrieval_predict(pil_image: Image.Image, text_en: str) -> float:
    """Score how well ``text_en`` matches the image with the BLIP ITM model.

    Returns the softmax value at index 1 of the model's first output.

    Raises:
        gr.Error: if the image or the text is missing.
    """
    if pil_image is None or not text_en:
        raise gr.Error("Нужны и картинка, и английский текст.")

    model_inputs = retrieval_processor(images=pil_image, text=text_en, return_tensors="pt")
    with torch.no_grad():
        logits = retrieval_model(**model_inputs)[0]
        match_probs = torch.nn.functional.softmax(logits, dim=1)
    return float(match_probs[0][1])
def transcribe_audio(filepath: str, language: str) -> str:
    """Transcribe the audio file with the ASR pipeline for ``language``.

    Returns ``""`` when ``filepath`` is ``None``.

    Raises:
        gr.Error: if ``language`` is neither "Английский" nor "Русский".
    """
    if filepath is None:
        return ""
    if language == "Английский":
        return asr_en(filepath)["text"]
    if language == "Русский":
        recognised = asr_ru(
            filepath,
            generate_kwargs={"language": "ru", "task": "transcribe"},
        )
        return recognised["text"]
    raise gr.Error("Неподдерживаемый язык распознавания.")
def tts_predict(text: str, language: str) -> str:
    """Synthesize ``text`` to speech and return the path of a 16-bit WAV file.

    The audio is peak-normalised to 0.99, clipped to [-1, 1] and written to
    a uniquely named file in the platform's temporary directory.

    Raises:
        gr.Error: if ``text`` is empty/blank or ``language`` is unsupported.
    """
    import os
    import tempfile

    if not text or not text.strip():
        raise gr.Error("Введите текст для озвучки.")
    if language == "Английский":
        result = tts_en(text)
    elif language == "Русский":
        result = tts_ru(text)
    else:
        raise gr.Error("Неподдерживаемый язык синтеза речи.")

    audio = result["audio"]
    sr = int(result["sampling_rate"])
    if hasattr(audio, "cpu"):
        audio = audio.cpu().numpy()
    # squeeze() turns a single-sample result into a 0-d array, which the WAV
    # writer cannot handle; atleast_1d keeps it a 1-D signal.
    audio = np.atleast_1d(np.asarray(audio, dtype=np.float32).squeeze())
    if audio.size == 0:
        audio = np.zeros(1, dtype=np.float32)

    peak = float(np.max(np.abs(audio)))
    if peak < 1e-6:
        # Silent clip: skip normalisation to avoid dividing by ~0.
        peak = 1.0
    audio = audio / peak * 0.99
    audio = np.clip(audio, -1.0, 1.0)
    audio_int16 = (audio * 32767).astype(np.int16)

    # Use the platform temp directory rather than a hard-coded "/tmp" so the
    # function also works where "/tmp" does not exist or the temp directory
    # has been redirected.
    filename = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.wav")
    wav_write(filename, sr, audio_int16)
    return filename
def nlp_predict(task: str, text: str) -> str:
    """Run the NLP task named by ``task`` on ``text`` and return a display string.

    Supported tasks are English/Russian sentiment analysis and English
    summarisation.

    Raises:
        gr.Error: if ``text`` is empty/blank or ``task`` is unknown.
    """
    if not text or not text.strip():
        raise gr.Error("Введите текст.")

    sentiment_models = {
        "Английский: анализ тональности": nlp_sentiment_en,
        "Русский: анализ тональности": nlp_sentiment_ru,
    }
    classifier = sentiment_models.get(task)
    if classifier is not None:
        top = classifier(text)[0]
        return f"Метка: {top['label']}, вероятность: {top['score']:.4f}"

    if task == "Английский: суммаризация":
        summaries = nlp_summarizer_en(
            text,
            max_length=180,
            min_length=60,
            do_sample=False,
        )
        return summaries[0]["summary_text"]

    raise gr.Error("Неизвестная NLP-задача.")
| # ========================== | |
| # Хелперы для карточки | |
| # ========================== | |
def build_title_from_caption(caption_ru: str) -> str:
    """Turn an image caption into a card title of at most 80 characters.

    Only the first character is upper-cased; the rest of the caption keeps
    its casing so brand names and acronyms survive. An empty or blank
    caption yields the default title "Описание товара". Titles longer than
    80 characters are cut to 77 characters plus "...".
    """
    title = (caption_ru or "").strip()
    if not title:
        return "Описание товара"
    # str.capitalize() would lower-case everything after the first letter.
    title = title[0].upper() + title[1:]
    if len(title) > 80:
        title = title[:77] + "..."
    return title
def build_bullets_from_text(text_ru: str, max_bullets: int = 5) -> str:
    """Split ``text_ru`` into sentences and format the first few as bullets.

    Sentences end at ``.``, ``!`` or ``?`` followed by whitespace or the end
    of the text, so decimals such as "1.5" are not split. At most
    ``max_bullets`` sentences are kept (a negative value is treated as 0).
    Returns one "• sentence" per line, or ``""`` when there is nothing to
    show.
    """
    import re

    if not text_ru:
        return ""
    pieces = re.split(r"[.!?]+(?:\s+|$)", text_ru)
    sentences = [piece.strip() for piece in pieces if piece.strip()]
    bullets = sentences[:max(max_bullets, 0)]
    return "\n".join(f"• {b}" for b in bullets)
def build_image_prompt(title_ru: str, bullets_ru: str, theme: str) -> str:
    """Compose a Russian text-to-image prompt for the card and return it in English.

    The prompt combines a background description chosen by ``theme``, the
    title and the bullet texts (bullet markers removed, joined with "; "),
    then is passed through ``translate_ru_en``.
    """
    backgrounds = {
        "Песочный": "тёплый песочный градиентный фон",
        "Бирюзовый": "свежий бирюзовый градиентный фон",
        "Тёплый коричневый": "тёплый коричнево-золотистый фон",
        "Серый": "минималистичный светло-серый фон",
    }
    background_ru = backgrounds.get(theme, "современный градиентный фон")

    advantages = "; ".join(
        row.strip("• ").strip() for row in bullets_ru.split("\n") if row.strip()
    )

    prompt_ru = "".join(
        (
            "Рекламная карточка товара для маркетплейса, современный минималистичный дизайн, ",
            f"{background_ru}, крупное фото товара по центру на прозрачном фоне, ",
            "аккуратные надписи на русском языке. ",
            f"Товар: {title_ru}. Основные преимущества: {advantages}. ",
            "Стиль профессиональной промо-карточки для Ozon или Wildberries.",
        )
    )
    return translate_ru_en(prompt_ru)
| # ========================== | |
| # Шаги пайплайна | |
| # ========================== | |
def step1_analyze_image(pil_image: Image.Image, det_threshold: float):
    """Run detection, segmentation and captioning on the uploaded photo.

    Returns a 5-tuple: detection image, segmentation image, Russian caption,
    and then the original image and the caption again (the UI stores these
    last two as state).

    Raises:
        gr.Error: if no image is supplied.
    """
    if pil_image is None:
        raise gr.Error("Загрузите фото товара.")
    detection_view = od_predict(pil_image, det_threshold)
    segmentation_view = sam_predict(pil_image)
    caption = caption_predict_ru(pil_image)
    return detection_view, segmentation_view, caption, pil_image, caption
def step2_generate_description(
    stored_image: Image.Image,
    stored_caption_ru: str,
    audio_path: str,
    audio_language: str,
    extra_features_ru: str,
):
    """Build the Russian title, bullets and description of the product card.

    The caption, the seller's voice transcript and the typed features are
    merged, translated to English, summarised, and translated back to
    Russian.

    Returns:
        ``(transcript, title_ru, bullets_ru, description_ru)``. The
        transcript slot holds a placeholder when there is no voice input.

    Raises:
        gr.Error: if Step 1 has not produced an image yet.
    """
    if stored_image is None:
        raise gr.Error("Сначала пройдите Шаг 1 и проанализируйте фото товара.")
    if not stored_caption_ru:
        stored_caption_ru = caption_predict_ru(stored_image)

    transcript = transcribe_audio(audio_path, audio_language) or ""
    spoken = transcript.strip()
    caption = stored_caption_ru.strip()
    extra = (extra_features_ru or "").strip()  # tolerate a missing value

    # An English transcript must not go through the RU->EN translator, so
    # it is appended after the Russian parts have been translated.
    transcript_is_english = audio_language == "Английский"
    if transcript_is_english:
        ru_parts = [caption, extra]
    else:
        ru_parts = [caption, spoken, extra]
    source_ru = ". ".join(part for part in ru_parts if part)
    source_en = translate_ru_en(source_ru)
    if transcript_is_english and spoken:
        source_en = ". ".join(part for part in (source_en, spoken) if part)

    desc_en = nlp_summarizer_en(
        source_en,
        max_length=220,
        min_length=60,
        do_sample=False,
    )[0]["summary_text"]
    desc_ru = translate_en_ru(desc_en)
    title_ru = build_title_from_caption(stored_caption_ru)
    bullets_ru = build_bullets_from_text(desc_ru, max_bullets=5)

    # The placeholder is for display only; it is deliberately kept out of
    # the text that is translated and summarised above.
    transcript_out = transcript if spoken else "(нет голосового описания продавца)"
    return (
        transcript_out,
        title_ru,
        bullets_ru,
        desc_ru,
    )
def step3_check_relevance(
    stored_image: Image.Image,
    stored_caption_ru: str,
    description_ru: str,
):
    """Report image-text match scores for the description and the caption.

    Both texts are translated to English and scored against the image with
    ``retrieval_predict``. Returns a three-line report string.

    Raises:
        gr.Error: if the image or the description is missing.
    """
    if stored_image is None or not description_ru:
        raise gr.Error("Нужны фото товара (Шаг 1) и описание (Шаг 2).")

    caption_ru = stored_caption_ru if stored_caption_ru else caption_predict_ru(stored_image)
    caption_en = translate_ru_en(caption_ru)
    description_en = translate_ru_en(description_ru)

    description_score = retrieval_predict(stored_image, description_en)
    caption_score = retrieval_predict(stored_image, caption_en)

    report_lines = [
        "Соответствие изображения и текста (0–1):",
        f"- Описание карточки vs изображение: {description_score:.4f}",
        f"- Базовая подпись vs изображение: {caption_score:.4f}",
    ]
    return "\n".join(report_lines)
def step4_tts(
    title_ru: str,
    description_ru: str,
    tts_language: str,
):
    """Voice the title and description and return the audio file path.

    The text is spoken as-is for "Русский"; for any other language it is
    first translated to English.

    Raises:
        gr.Error: if the title or the description is missing.
    """
    if not title_ru or not description_ru:
        raise gr.Error("Сначала сформируйте заголовок и описание на Шаге 2.")

    narration_ru = f"{title_ru}. {description_ru}"
    narration = narration_ru if tts_language == "Русский" else translate_ru_en(narration_ru)
    return tts_predict(narration, tts_language)
def step5_generate_card_and_prompt(
    stored_image: Image.Image,
    title_ru: str,
    bullets_ru: str,
    theme: str,
) -> tuple[Image.Image, str]:
    """Render the 900x1200 product card and build the text-to-image prompt.

    The product cutout is scaled to fit 60% of the card width and 50% of its
    height and pasted on a gradient chosen by ``theme``. The title and up to
    five bullets are drawn on a translucent white panel at the bottom.

    Returns:
        ``(card_image_rgb, prompt_en)``.

    Raises:
        gr.Error: if the image, the title or the bullets are missing.
    """
    if stored_image is None:
        raise gr.Error("Нужно фото товара (Шаг 1).")
    if not title_ru or not bullets_ru:
        raise gr.Error("Нужны заголовок и ключевые преимущества (Шаг 2).")

    card_w, card_h = 900, 1200
    palettes = {
        "Песочный": ((255, 230, 200), (215, 180, 150)),
        "Бирюзовый": ((210, 235, 245), (160, 205, 225)),
        "Тёплый коричневый": ((215, 170, 130), (165, 115, 80)),
        "Серый": ((240, 240, 245), (210, 210, 220)),
    }
    top_color, bottom_color = palettes.get(theme, ((255, 230, 200), (215, 180, 150)))
    bg = create_vertical_gradient((card_w, card_h), top_color, bottom_color)
    card = bg.convert("RGBA")

    # Product cutout, scaled to fit its slot while keeping the aspect ratio.
    product_rgba = cutout_product(stored_image)
    max_img_w = int(card_w * 0.6)
    max_img_h = int(card_h * 0.5)
    w, h = product_rgba.size
    scale = min(max_img_w / w, max_img_h / h)
    # A very thin cutout could otherwise round down to a 0-pixel side,
    # which resize() rejects.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    product_resized = product_rgba.resize((new_w, new_h), Image.LANCZOS)
    img_x = (card_w - new_w) // 2
    img_y = int(card_h * 0.17)
    card.alpha_composite(product_resized, (img_x, img_y))

    # Translucent white panel that carries the text.
    text_block_h = int(card_h * 0.4)
    text_block_y = card_h - text_block_h - 40
    overlay = Image.new("RGBA", (card_w - 80, text_block_h), (255, 255, 255, 235))
    card.alpha_composite(overlay, (40, text_block_y))

    draw = ImageDraw.Draw(card)
    title_font = load_font(46, bold=True)
    bullet_font = load_font(26, bold=False)
    x_text = 60
    y_text = text_block_y + 24

    title_lines = wrap_text(title_ru, title_font, card_w - 2 * x_text, draw)
    for line in title_lines:
        draw.text((x_text, y_text), line, font=title_font, fill=(20, 20, 20))
        bbox = draw.textbbox((x_text, y_text), line, font=title_font)
        y_text = bbox[3] + 6
    y_text += 8

    bullet_lines = [line.strip("• ").strip() for line in bullets_ru.split("\n") if line.strip()]
    bullet_lines = bullet_lines[:5]
    max_bullet_width = card_w - 2 * x_text
    for bl in bullet_lines:
        bullet_text = "• " + bl
        wrapped = wrap_text(bullet_text, bullet_font, max_bullet_width, draw)
        for wline in wrapped:
            draw.text((x_text, y_text), wline, font=bullet_font, fill=(40, 40, 40))
            bbox = draw.textbbox((x_text, y_text), wline, font=bullet_font)
            y_text = bbox[3] + 3
        y_text += 4

    prompt_en = build_image_prompt(title_ru, bullets_ru, theme)
    final_card = card.convert("RGB")
    return final_card, prompt_en
| # ========================== | |
| # UI | |
| # ========================== | |
# Custom CSS passed to gr.Blocks in build_app(): caps the container width,
# centres it, and centres h1-h3 headings.
css = """
#root .gradio-container {
max-width: 1100px;
margin-left: auto;
margin-right: auto;
}
#root .gradio-container h1,
#root .gradio-container h2,
#root .gradio-container h3 {
text-align: center;
}
"""
def build_step1_tab(state_image, state_caption):
    """Build the "Step 1" tab: photo upload and image analysis.

    Creates the image input, detection and segmentation outputs, caption
    textbox, threshold slider and a button wired to ``step1_analyze_image``.
    The click also writes the image and caption into ``state_image`` and
    ``state_caption`` for the later steps.
    """
    with gr.TabItem("Шаг 1. Анализ фото товара"):
        gr.Markdown(
            """
            ### Шаг 1. Анализ фото товара
            - Загрузите фото товара.
            - YOLOS выделит основной объект (bbox «товар»).
            - SegFormer (B4) покажет маску товара (для наглядности).
            - Будет сгенерирована **русская подпись** к изображению (BLIP + перевод).
            """
        )
        with gr.Row():
            with gr.Column():
                img_in = gr.Image(type="pil", label="Фото товара")
            with gr.Column():
                det_out = gr.Image(label="Обнаруженный товар (bbox)", type="pil")
                seg_out = gr.Image(label="Сегментация товара", type="pil")
        with gr.Row():
            caption_out = gr.Textbox(
                label="Базовая подпись к изображению (RU)",
                lines=2,
            )
        det_thr = gr.Slider(
            0.0,
            1.0,
            value=0.25,
            step=0.01,
            label="Порог уверенности (для YOLOS почти не важен, выбираем основной объект)",
        )
        run = gr.Button("Проанализировать фото")
        # Output order must match the 5-tuple returned by step1_analyze_image.
        run.click(
            fn=step1_analyze_image,
            inputs=[img_in, det_thr],
            outputs=[det_out, seg_out, caption_out, state_image, state_caption],
        )
def build_step2_tab(
    state_image,
    state_caption,
):
    """Build the "Step 2" tab: voice/text input and generated card texts.

    Wires the button to ``step2_generate_description`` and returns the
    editable title, bullets and description textboxes so later tabs can read
    them as inputs.
    """
    with gr.TabItem("Шаг 2. Описание товара"):
        gr.Markdown(
            """
            ### Шаг 2. Описание товара
            - Надиктуйте голосовое описание (по-русски) — опционально.
            - Допишите ключевые характеристики.
            - На основе фото, подписи, голоса и характеристик будет сформирована
              **карточка товара на русском: Title + bullets + полное описание**.
            Поля ниже можно редактировать вручную — изменения пойдут в последующие шаги.
            """
        )
        with gr.Row():
            with gr.Column():
                audio_in = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Голосовое описание (опционально)",
                )
                audio_lang = gr.Radio(
                    ["Английский", "Русский"],
                    value="Русский",
                    label="Язык голосового описания",
                )
                extra_features = gr.Textbox(
                    label="Ключевые характеристики / преимущества (RU)",
                    lines=4,
                    placeholder="Материал, размер, цвет, комплектация, гарантия и т.п.",
                )
                run = gr.Button("Сформировать описание")
            with gr.Column():
                transcript_out = gr.Textbox(
                    label="Транскрипт голосового описания (RU)",
                    lines=4,
                )
        with gr.Row():
            title_out = gr.Textbox(
                label="Заголовок карточки (Title, RU)",
                lines=2,
                interactive=True,
            )
        with gr.Row():
            bullets_out = gr.Textbox(
                label="Ключевые преимущества (bullets, RU)",
                lines=6,
                interactive=True,
            )
            description_out = gr.Textbox(
                label="Полное описание карточки товара (RU)",
                lines=8,
                interactive=True,
            )
        # Output order must match the 4-tuple returned by
        # step2_generate_description.
        run.click(
            fn=step2_generate_description,
            inputs=[state_image, state_caption, audio_in, audio_lang, extra_features],
            outputs=[
                transcript_out,
                title_out,
                bullets_out,
                description_out,
            ],
        )
    return title_out, bullets_out, description_out
def build_step3_tab(state_image, state_caption, description_comp):
    """Build the "Step 3" tab: image-text relevance check.

    The button calls ``step3_check_relevance`` with the stored image, the
    stored caption and the description textbox from Step 2, and shows the
    report in a textbox.
    """
    with gr.TabItem("Шаг 3. Проверка соответствия"):
        gr.Markdown(
            """
            ### Шаг 3. Проверка соответствия фото и описания
            Внутри текст переводится на английский и прогоняется через BLIP ITM,
            но пользователю показываются только числовые оценки.
            """
        )
        relevance_out = gr.Textbox(
            label="Результат проверки (Image–Text Relevance)",
            lines=6,
        )
        run = gr.Button("Проверить соответствие")
        run.click(
            fn=step3_check_relevance,
            inputs=[state_image, state_caption, description_comp],
            outputs=relevance_out,
        )
def build_step4_tab(title_comp, description_comp):
    """Build the "Step 4" tab: text-to-speech for the card.

    The button calls ``step4_tts`` with the title and description textboxes
    from Step 2 and the selected language, and plays the returned file.
    """
    with gr.TabItem("Шаг 4. Озвучка карточки"):
        gr.Markdown(
            """
            ### Шаг 4. Озвучка карточки товара
            Озвучивается заголовок и полное описание (Bark TTS).
            Можно выбрать русский или английский голос.
            """
        )
        tts_lang = gr.Radio(
            ["Английский", "Русский"],
            value="Русский",
            label="Язык озвучки",
        )
        tts_out = gr.Audio(
            label="Аудио-презентация товара",
            type="filepath",
        )
        run = gr.Button("Сгенерировать озвучку")
        run.click(
            fn=step4_tts,
            inputs=[title_comp, description_comp, tts_lang],
            outputs=tts_out,
        )
def build_step5_tab(state_image, title_comp, bullets_comp):
    """Build the "Step 5" tab: rendered card image and generation prompt.

    The button calls ``step5_generate_card_and_prompt`` with the stored
    image, the title and bullets textboxes from Step 2 and the chosen colour
    theme.
    """
    with gr.TabItem("Шаг 5. Карточка товара (изображение)"):
        gr.Markdown(
            """
            ### Шаг 5. Визуальная карточка товара
            - Товар вырезается из фото по цвету фона (без белого прямоугольника).
            - Вставляется на градиентный фон.
            - Снизу блок с заголовком и bullets.
            - Параллельно формируется prompt (EN) для text-to-image моделей.
            """
        )
        theme = gr.Radio(
            ["Песочный", "Бирюзовый", "Тёплый коричневый", "Серый"],
            value="Песочный",
            label="Цветовая тема",
        )
        card_out = gr.Image(
            label="Сгенерированная карточка товара",
            type="pil",
        )
        prompt_out = gr.Textbox(
            label="Prompt для генеративной модели (EN)",
            lines=6,
        )
        run = gr.Button("Сгенерировать карточку")
        # Output order must match the (card, prompt) tuple returned by
        # step5_generate_card_and_prompt.
        run.click(
            fn=step5_generate_card_and_prompt,
            inputs=[state_image, title_comp, bullets_comp, theme],
            outputs=[card_out, prompt_out],
        )
def build_app() -> gr.Blocks:
    """Assemble the five-tab Gradio app and return the ``gr.Blocks`` object.

    Two ``gr.State`` holders carry the uploaded image and its caption from
    Step 1 to the later steps; the Step 2 textboxes are passed on to Steps
    3-5 as inputs.
    """
    with gr.Blocks(
        css=css,
        title="Marketplace Multimodal Hub (RU)",
        theme=gr.themes.Soft(),
    ) as demo:
        gr.Markdown(
            """
            # Marketplace Multimodal Hub (RU)
            Пошаговый пайплайн создания **русской** карточки товара для маркетплейса
            на основе фото, голосового описания и текстовых характеристик.
            """
        )
        # Shared between tabs: written in Step 1, read in Steps 2, 3 and 5.
        state_image = gr.State()
        state_caption = gr.State()
        with gr.Tabs():
            build_step1_tab(state_image, state_caption)
            title_comp, bullets_comp, description_comp = build_step2_tab(
                state_image,
                state_caption,
            )
            build_step3_tab(state_image, state_caption, description_comp)
            build_step4_tab(title_comp, description_comp)
            build_step5_tab(state_image, title_comp, bullets_comp)
    return demo
# Built at import time so the module exposes a ready ``app`` object.
app = build_app()
if __name__ == "__main__":
    app.launch()