Spaces:

eternalGenius
/

strcp-course-kononov

Sleeping

App Files Files Community

strcp-course-kononov / app.py

eternalGenius

Update app.py

1bf3ef4 verified 7 days ago

raw

history blame

33.3 kB

	from __future__ import annotations

	import uuid
	from typing import List, Dict, Any, Optional, Tuple

	import sys
	import subprocess

	# ==========================
	# Install sentencepiece for the Helsinki-NLP translation models
	# ==========================
	try:
	    import sentencepiece # noqa
	except ImportError:
	    # Best-effort runtime install: check=False means a failing pip run
	    # does not raise here.
	    subprocess.run(
	        [sys.executable, "-m", "pip", "install", "sentencepiece"],
	        check=False,
	    )

	import gradio as gr
	import numpy as np
	import torch
	from PIL import Image, ImageDraw, ImageFont
	from scipy.io.wavfile import write as wav_write
	from transformers import (
	pipeline,
	BlipForImageTextRetrieval,
	AutoProcessor,
	)
	from transformers.utils import logging as hf_logging

	hf_logging.set_verbosity_error()

	# ==========================
	# Вспомогательные функции
	# ==========================


	def _ensure_rgb(img: Image.Image) -> Image.Image:
	    """Return ``img`` itself when its mode is RGB, otherwise an RGB conversion of it."""
	    already_rgb = img.mode == "RGB"
	    return img if already_rgb else img.convert("RGB")


	def load_font(size: int, bold: bool = False) -> ImageFont.FreeTypeFont | ImageFont.ImageFont:
	    """Load a DejaVu Sans TrueType font, falling back to Pillow's default font.

	    Args:
	        size: Font size passed to ``ImageFont.truetype``.
	        bold: When true, the bold face is tried before the regular one.

	    Returns:
	        The first candidate font that loads, or ``ImageFont.load_default()``
	        when none of the candidate files can be loaded.
	    """
	    candidates = []
	    if bold:
	        candidates.append("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf")
	    candidates.append("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf")

	    for path in candidates:
	        try:
	            return ImageFont.truetype(path, size=size)
	        except Exception:
	            # Deliberately best-effort: any failure to load this file just
	            # moves on to the next candidate / the default font.
	            continue
	    return ImageFont.load_default()


	def wrap_text(text: str, font: ImageFont.ImageFont, max_width: int, draw: ImageDraw.ImageDraw) -> list[str]:
	    """Greedily wrap ``text`` into lines no wider than ``max_width``.

	    Width is measured with ``draw.textbbox``. A single word that is wider
	    than ``max_width`` is still placed on a line of its own.
	    """
	    wrapped: list[str] = []
	    pending: list[str] = []
	    for word in text.split():
	        candidate = " ".join([*pending, word])
	        left, _, right, _ = draw.textbbox((0, 0), candidate, font=font)
	        too_wide = right - left > max_width
	        if pending and too_wide:
	            # Flush the line built so far and start a new one with this word.
	            wrapped.append(" ".join(pending))
	            pending = [word]
	        else:
	            pending.append(word)
	    if pending:
	        wrapped.append(" ".join(pending))
	    return wrapped


	def create_vertical_gradient(size: tuple[int, int], top_color: tuple[int, int, int],
	                             bottom_color: tuple[int, int, int]) -> Image.Image:
	    """Build an RGB image of ``size`` that blends from ``top_color`` to ``bottom_color``.

	    A one-pixel-wide column is filled row by row with the linearly
	    interpolated colour and then resized to the requested width.
	    """
	    width, height = size
	    column = Image.new("RGB", (1, height))
	    last_row = max(height - 1, 1)  # avoids division by zero for 1px-tall images
	    for row in range(height):
	        t = row / last_row
	        pixel = tuple(
	            int(top * (1 - t) + bottom * t)
	            for top, bottom in zip(top_color, bottom_color)
	        )
	        column.putpixel((0, row), pixel)
	    return column.resize((width, height))


	def render_results_in_image(
	    pil_image: Image.Image,
	    detections: List[Dict[str, Any]],
	    score_threshold: float = 0.5,
	) -> Image.Image:
	    """Draw boxes and "label score" captions for detections on a copy of the image.

	    Args:
	        pil_image: Source image; it is converted to RGB and copied, not modified.
	        detections: Dicts read via ``"score"``, ``"label"`` and ``"box"`` (with
	            ``xmin``/``ymin``/``xmax``/``ymax``); missing keys default to 0 / "".
	        score_threshold: Detections scoring below this value are skipped.

	    Returns:
	        The annotated RGB copy.
	    """
	    img = _ensure_rgb(pil_image).copy()
	    draw = ImageDraw.Draw(img)

	    W, H = img.size
	    # od_predict passes a Cyrillic label, so prefer the TrueType font from
	    # load_font (which itself falls back to Pillow's default font).
	    font = load_font(14)
	    pad = 2

	    for det in detections:
	        score = float(det.get("score", 0.0))
	        if score < score_threshold:
	            continue
	        lbl = str(det.get("label", ""))
	        box = det.get("box", {})

	        # Clamp to the image and order the corners so the rectangle is valid
	        # even if the detector reports swapped coordinates.
	        x1, x2 = sorted(max(0, min(W, box.get(key, 0))) for key in ("xmin", "xmax"))
	        y1, y2 = sorted(max(0, min(H, box.get(key, 0))) for key in ("ymin", "ymax"))

	        draw.rectangle([(x1, y1), (x2, y2)], outline=(0, 255, 0), width=3)

	        # Filled caption background in the box's top-left corner, clipped to the image.
	        text = f"{lbl} {score:.2f}"
	        bbox = draw.textbbox((0, 0), text, font=font)
	        tw = bbox[2] - bbox[0]
	        th = bbox[3] - bbox[1]
	        tx2 = min(x1 + tw + 2 * pad, W)
	        ty2 = min(y1 + th + 2 * pad, H)
	        draw.rectangle([(x1, y1), (tx2, ty2)], fill=(0, 255, 0))
	        draw.text((x1 + pad, y1 + pad), text, fill=(0, 0, 0), font=font)

	    return img


	def seg_score(seg: Dict[str, Any]) -> float:
	    """Return the segment's ``"score"`` as a float; 0.0 when missing or not numeric."""
	    raw = seg.get("score", 0.0)
	    try:
	        value = float(raw)
	    except (TypeError, ValueError):
	        value = 0.0
	    return value


	def show_masks_on_image(pil_image: Image.Image, masks: list[np.ndarray]) -> Image.Image:
	    """Overlay every mask as a semi-transparent red tint and return the RGBA result."""
	    base = pil_image.convert("RGBA")
	    tint = Image.new("RGBA", base.size, (255, 0, 0, 100))
	    layer = Image.new("RGBA", base.size)
	    for mask in masks:
	        # Scale the mask to 0/255 and fit it to the image before using it as a stencil.
	        stencil = Image.fromarray((mask * 255).astype("uint8")).resize(base.size).convert("L")
	        layer = Image.composite(tint, layer, stencil)
	    return Image.alpha_composite(base, layer)


	# ==========================
	# Модели и пайплайны
	# ==========================

	# Object detector; get_main_detection_box and od_predict pick the largest box it returns.
	od_pipe = pipeline("object-detection", model="hustvl/yolos-tiny")

	# Segmentation pipeline whose masks are ranked by choose_product_mask.
	segmentation_pipe = pipeline(
	    task="image-segmentation",
	    model="nvidia/segformer-b4-finetuned-ade-512-512",
	)

	# BLIP image-text matching model and its processor (used by retrieval_predict).
	retrieval_model_name = "Salesforce/blip-itm-base-coco"
	retrieval_model = BlipForImageTextRetrieval.from_pretrained(retrieval_model_name)
	retrieval_processor = AutoProcessor.from_pretrained(retrieval_model_name)

	# Image captioning pipeline (used by caption_predict_ru).
	caption_pipe = pipeline(
	    "image-to-text",
	    model="Salesforce/blip-image-captioning-large",
	)

	# Speech recognition: transcribe_audio uses asr_en for English input and
	# asr_ru (called with language="ru") for Russian input.
	asr_en = pipeline(
	    task="automatic-speech-recognition",
	    model="openai/whisper-tiny.en",
	)
	asr_ru = pipeline(
	    task="automatic-speech-recognition",
	    model="openai/whisper-tiny",
	)

	# Both languages use the same Bark checkpoint, so it is loaded once and
	# shared instead of keeping two identical copies of the model in memory.
	tts_en = pipeline(
	    task="text-to-speech",
	    model="suno/bark-small",
	)
	tts_ru = tts_en

	# Sentiment classifiers for English and Russian text (used by nlp_predict).
	nlp_sentiment_en = pipeline(
	    task="sentiment-analysis",
	    model="distilbert-base-uncased-finetuned-sst-2-english",
	)
	nlp_sentiment_ru = pipeline(
	    task="sentiment-analysis",
	    model="blanchefort/rubert-base-cased-sentiment",
	)
	# English summariser (used by nlp_predict and step2_generate_description).
	nlp_summarizer_en = pipeline(
	    task="summarization",
	    model="facebook/bart-large-cnn",
	)

	# Translation pipelines wrapped by translate_en_ru / translate_ru_en.
	translator_en_ru = pipeline(
	    "translation",
	    model="Helsinki-NLP/opus-mt-en-ru",
	)
	translator_ru_en = pipeline(
	    "translation",
	    model="Helsinki-NLP/opus-mt-ru-en",
	)


	# ==========================
	# Детекция / сегментация
	# ==========================

	def get_main_detection_box(pil_image: Image.Image) -> Dict[str, float]:
	    """Return the box of the largest-area detection, or the full image when nothing is detected."""
	    width, height = pil_image.size
	    detections = od_pipe(pil_image)
	    if not detections:
	        return {"xmin": 0, "ymin": 0, "xmax": width, "ymax": height}

	    largest = max(
	        detections,
	        key=lambda det: float(
	            (det["box"]["xmax"] - det["box"]["xmin"])
	            * (det["box"]["ymax"] - det["box"]["ymin"])
	        ),
	    )
	    return largest["box"]


	def compute_iou(box1: Dict[str, float], box2: Dict[str, float]) -> float:
	    """Return intersection-over-union of two ``xmin/ymin/xmax/ymax`` boxes (0.0 when disjoint)."""
	    left = max(box1["xmin"], box2["xmin"])
	    top = max(box1["ymin"], box2["ymin"])
	    right = min(box1["xmax"], box2["xmax"])
	    bottom = min(box1["ymax"], box2["ymax"])

	    overlap = max(0.0, right - left) * max(0.0, bottom - top)
	    if overlap == 0:
	        return 0.0

	    def _area(box):
	        return (box["xmax"] - box["xmin"]) * (box["ymax"] - box["ymin"])

	    total = _area(box1) + _area(box2) - overlap
	    return overlap / total if total > 0 else 0.0


	def choose_product_mask(
	    segments: List[Dict[str, Any]],
	    image_size: Tuple[int, int],
	    det_box: Dict[str, float],
	) -> Optional[np.ndarray]:
	    """Pick the segmentation mask that best matches the detected box.

	    A segment is rejected when its mask covers under 0.5% or over 90% of the
	    image, when the IoU of its bounding box with ``det_box`` is below 0.1, or
	    when that bounding box's area is outside 0.25x-1.5x of the detection area.
	    Survivors are ranked by ``seg_score(seg) + 2 * IoU``.

	    Returns:
	        The winning boolean mask, or ``None`` when every segment is rejected.
	    """
	    W, H = image_size
	    det_w = det_box["xmax"] - det_box["xmin"]
	    det_h = det_box["ymax"] - det_box["ymin"]
	    det_area = max(det_w * det_h, 1.0)

	    winner = None
	    winner_quality = -1.0

	    for seg in segments:
	        candidate = np.array(seg["mask"]) > 0
	        rows, cols = candidate.shape
	        if (cols, rows) != (W, H):
	            # Bring the mask to the image resolution before measuring it.
	            resized = Image.fromarray(candidate.astype("uint8")).resize((W, H))
	            candidate = np.array(resized) > 0

	        fraction = candidate.sum() / float(W * H)
	        if fraction < 0.005 or fraction > 0.90:
	            continue

	        ys, xs = np.where(candidate)
	        if xs.size == 0 or ys.size == 0:
	            continue

	        mask_box = {
	            "xmin": float(xs.min()),
	            "xmax": float(xs.max()),
	            "ymin": float(ys.min()),
	            "ymax": float(ys.max()),
	        }

	        overlap = compute_iou(det_box, mask_box)
	        if overlap < 0.1:
	            continue

	        mask_box_area = (mask_box["xmax"] - mask_box["xmin"]) * (mask_box["ymax"] - mask_box["ymin"])
	        coverage = mask_box_area / det_area
	        if coverage < 0.25 or coverage > 1.5:
	            continue

	        quality = seg_score(seg) + 2.0 * overlap
	        if quality > winner_quality:
	            winner_quality = quality
	            winner = candidate

	    return winner


	# ==========================
	# Перевод
	# ==========================

	def translate_en_ru(text: str) -> str:
	    """Translate English text to Russian; empty input yields an empty string."""
	    if text:
	        outputs = translator_en_ru(text, max_length=512)
	        return outputs[0]["translation_text"]
	    return ""


	def translate_ru_en(text: str) -> str:
	    """Translate Russian text to English; empty input yields an empty string."""
	    if text:
	        outputs = translator_ru_en(text, max_length=512)
	        return outputs[0]["translation_text"]
	    return ""


	# ==========================
	# Инференс
	# ==========================

	def od_predict(pil_image: Image.Image, score_threshold: float) -> Image.Image:
	    """Draw the largest sufficiently confident detection, labelled "товар".

	    Args:
	        pil_image: Image to analyse.
	        score_threshold: Detections scoring below this value are ignored
	            (previously the argument was accepted but never applied).

	    Returns:
	        An annotated copy of the image, or ``pil_image`` itself when no
	        detection passes the threshold.

	    Raises:
	        gr.Error: If the detection pipeline or the image is missing.
	    """
	    if od_pipe is None:
	        raise gr.Error("Пайплайн для детекции объектов не инициализирован.")
	    if pil_image is None:
	        raise gr.Error("Пожалуйста, загрузите изображение.")

	    threshold = float(score_threshold or 0.0)
	    outputs = od_pipe(pil_image) or []
	    confident = [det for det in outputs if float(det.get("score", 0.0)) >= threshold]
	    if not confident:
	        return pil_image

	    def area(det: Dict[str, Any]) -> float:
	        b = det["box"]
	        return float((b["xmax"] - b["xmin"]) * (b["ymax"] - b["ymin"]))

	    # Copy before relabelling so the pipeline's own output dict is not mutated.
	    best = dict(max(confident, key=area))
	    best["label"] = "товар"

	    # Filtering already happened above, so the renderer must not drop the box.
	    return render_results_in_image(pil_image, [best], score_threshold=0.0)


	def sam_predict(pil_image: Image.Image) -> Image.Image:
	    """Overlay the chosen product mask on the image.

	    Falls back to a rectangular mask over the detected box when no
	    segmentation mask is selected.

	    Raises:
	        gr.Error: If the segmentation pipeline or the image is missing.
	    """
	    if segmentation_pipe is None:
	        raise gr.Error("Пайплайн сегментации не инициализирован.")
	    if pil_image is None:
	        raise gr.Error("Пожалуйста, загрузите изображение.")

	    W, H = pil_image.size
	    det_box = get_main_detection_box(pil_image)

	    segments = segmentation_pipe(pil_image)
	    mask = choose_product_mask(segments, pil_image.size, det_box) if segments else None

	    if mask is None:
	        # Fallback: rectangular mask over the product's detection box.
	        left = int(max(0, min(W - 1, det_box["xmin"])))
	        right = int(max(0, min(W, det_box["xmax"])))
	        top = int(max(0, min(H - 1, det_box["ymin"])))
	        bottom = int(max(0, min(H, det_box["ymax"])))
	        mask = np.zeros((H, W), dtype=bool)
	        mask[top:bottom, left:right] = True

	    return show_masks_on_image(pil_image, [mask])


	def cutout_product(pil_image: Image.Image) -> Image.Image:
	    """Cut the product out of the photo for the card.

	    1) Find the product box with the object detector.
	    2) Crop it with a small margin.
	    3) Take the mean colour of the crop's four corners as the background.
	    4) Make every pixel close to that colour transparent; the rest is
	       treated as the product.

	    If the detection box is degenerate (zero width or height after
	    clamping), the whole image is used instead of an empty crop.

	    Returns:
	        An RGBA crop whose background-coloured pixels have alpha 0.
	    """
	    pil_image = _ensure_rgb(pil_image)
	    W, H = pil_image.size
	    det_box = get_main_detection_box(pil_image)

	    x1 = int(max(0, min(W - 1, det_box["xmin"])))
	    x2 = int(max(0, min(W, det_box["xmax"])))
	    y1 = int(max(0, min(H - 1, det_box["ymin"])))
	    y2 = int(max(0, min(H, det_box["ymax"])))

	    # Small margin around the product.
	    pad_x = int(0.05 * (x2 - x1))
	    pad_y = int(0.05 * (y2 - y1))
	    x1 = max(0, x1 - pad_x)
	    x2 = min(W, x2 + pad_x)
	    y1 = max(0, y1 - pad_y)
	    y2 = min(H, y2 + pad_y)

	    if x2 <= x1 or y2 <= y1:
	        # An empty crop would break the corner sampling below.
	        x1, y1, x2, y2 = 0, 0, W, H

	    crop = pil_image.crop((x1, y1, x2, y2)).convert("RGBA")
	    arr = np.array(crop).astype(np.uint8)
	    h, w, _ = arr.shape

	    # Background = mean colour of the four corners.
	    corners = np.array(
	        [
	            arr[0, 0],
	            arr[0, w - 1],
	            arr[h - 1, 0],
	            arr[h - 1, w - 1],
	        ],
	        dtype=np.float32,
	    )
	    bg = corners.mean(axis=0)  # [R, G, B, A]; only the RGB part is used below

	    diff = np.linalg.norm(arr[..., :3].astype(np.float32) - bg[:3], axis=-1)
	    bg_brightness = bg[:3].mean()

	    # Very light background -> tighter colour-distance threshold, otherwise a looser one.
	    thr = 25.0 if bg_brightness > 220 else 40.0
	    mask_bg = diff < thr

	    alpha = np.where(mask_bg, 0, 255).astype(np.uint8)
	    arr[..., 3] = alpha

	    return Image.fromarray(arr)


	def caption_predict_ru(pil_image: Image.Image) -> str:
	    """Caption the image in English and return the Russian translation of that caption.

	    Raises:
	        gr.Error: If no image is supplied.
	    """
	    if pil_image is None:
	        raise gr.Error("Пожалуйста, загрузите изображение.")
	    outputs = caption_pipe(pil_image, max_new_tokens=50)
	    return translate_en_ru(outputs[0]["generated_text"])


	def retrieval_predict(pil_image: Image.Image, text_en: str) -> float:
	    """Return the softmax value at index 1 of the retrieval model's first output for the pair.

	    Raises:
	        gr.Error: If the image or the text is missing.
	    """
	    if not (pil_image is not None and text_en):
	        raise gr.Error("Нужны и картинка, и английский текст.")
	    inputs = retrieval_processor(images=pil_image, text=text_en, return_tensors="pt")
	    with torch.no_grad():
	        logits = retrieval_model(**inputs)[0]
	    match_probs = torch.nn.functional.softmax(logits, dim=1)
	    return float(match_probs[0][1])


	def transcribe_audio(filepath: str, language: str) -> str:
	    """Transcribe an audio file with the recogniser for ``language``.

	    Returns an empty string when ``filepath`` is ``None``.

	    Raises:
	        gr.Error: If ``language`` is neither "Английский" nor "Русский".
	    """
	    if filepath is None:
	        return ""

	    if language == "Английский":
	        return asr_en(filepath)["text"]
	    if language == "Русский":
	        recognised = asr_ru(
	            filepath,
	            generate_kwargs={"language": "ru", "task": "transcribe"},
	        )
	        return recognised["text"]
	    raise gr.Error("Неподдерживаемый язык распознавания.")


	def tts_predict(text: str, language: str) -> str:
	    """Synthesise speech for ``text`` and write it to a 16-bit WAV file.

	    Args:
	        text: Text to speak; must not be empty or whitespace-only.
	        language: "Английский" or "Русский", selecting the TTS pipeline.

	    Returns:
	        Path of the WAV file created in the system temporary directory.

	    Raises:
	        gr.Error: If the text is empty or the language is not supported.
	    """
	    import os
	    import tempfile

	    if not text or text.strip() == "":
	        raise gr.Error("Введите текст для озвучки.")

	    if language == "Английский":
	        result = tts_en(text)
	    elif language == "Русский":
	        result = tts_ru(text)
	    else:
	        raise gr.Error("Неподдерживаемый язык синтеза речи.")

	    audio = result["audio"]
	    sr = int(result["sampling_rate"])

	    if hasattr(audio, "cpu"):
	        audio = audio.cpu().numpy()
	    # squeeze() turns a single-sample result into a 0-d array, which cannot
	    # be written as a mono track; atleast_1d keeps it one-dimensional.
	    audio = np.atleast_1d(np.asarray(audio, dtype=np.float32).squeeze())

	    if audio.size == 0:
	        audio = np.zeros(1, dtype=np.float32)

	    # Peak-normalise to just under full scale; silence is left as is.
	    peak = float(np.max(np.abs(audio)))
	    if peak < 1e-6:
	        peak = 1.0
	    audio = audio / peak * 0.99
	    audio = np.clip(audio, -1.0, 1.0)

	    audio_int16 = (audio * 32767).astype(np.int16)

	    # Use the platform's temp directory instead of a hard-coded /tmp.
	    filename = os.path.join(tempfile.gettempdir(), f"tts_{uuid.uuid4().hex}.wav")
	    wav_write(filename, sr, audio_int16)

	    return filename


	def nlp_predict(task: str, text: str) -> str:
	    """Run the selected NLP task on ``text`` and return a displayable result.

	    Sentiment tasks return a "label, probability" line; the summarisation
	    task returns the summary text.

	    Raises:
	        gr.Error: If the text is empty or the task name is unknown.
	    """
	    if not text or not text.strip():
	        raise gr.Error("Введите текст.")

	    sentiment_models = {
	        "Английский: анализ тональности": nlp_sentiment_en,
	        "Русский: анализ тональности": nlp_sentiment_ru,
	    }
	    if task in sentiment_models:
	        top = sentiment_models[task](text)[0]
	        return f"Метка: {top['label']}, вероятность: {top['score']:.4f}"

	    if task == "Английский: суммаризация":
	        summaries = nlp_summarizer_en(
	            text,
	            max_length=180,
	            min_length=60,
	            do_sample=False,
	        )
	        return summaries[0]["summary_text"]

	    raise gr.Error("Неизвестная NLP-задача.")


	# ==========================
	# Хелперы для карточки
	# ==========================

	def build_title_from_caption(caption_ru: str) -> str:
	    """Turn a caption into a card title of at most 80 characters.

	    Only the first character is upper-cased; the rest of the caption keeps
	    its original casing so names such as brands are not lower-cased. An
	    empty or whitespace-only caption yields the default title. Longer
	    titles are cut to 77 characters plus "...".
	    """
	    title = caption_ru.strip() if caption_ru else ""
	    if not title:
	        return "Описание товара"
	    title = title[:1].upper() + title[1:]
	    if len(title) > 80:
	        title = title[:77] + "..."
	    return title


	def build_bullets_from_text(text_ru: str, max_bullets: int = 5) -> str:
	    """Split text into sentences and format the first ones as "• " bullet lines.

	    Sentences are split at ".", "!" or "?" followed by whitespace, so dots
	    inside numbers ("1.5") do not start a new bullet. Trailing full stops are
	    dropped and inner whitespace is collapsed so each bullet stays on one line.

	    Args:
	        text_ru: Source text; empty input yields an empty string.
	        max_bullets: Maximum number of bullets to return.

	    Returns:
	        Bullet lines joined with newlines.
	    """
	    import re

	    if not text_ru:
	        return ""
	    pieces = re.split(r"(?<=[.!?])\s+", text_ru.strip())
	    sentences = []
	    for piece in pieces:
	        cleaned = " ".join(piece.split()).rstrip(".").strip()
	        if cleaned:
	            sentences.append(cleaned)
	    return "\n".join(f"• {sentence}" for sentence in sentences[:max_bullets])


	def build_image_prompt(title_ru: str, bullets_ru: str, theme: str) -> str:
	    """Compose a Russian description of the card and return its English translation.

	    The description combines a background phrase chosen by ``theme`` (with a
	    generic fallback), the title and the bullets joined with "; ".
	    """
	    background_by_theme = {
	        "Песочный": "тёплый песочный градиентный фон",
	        "Бирюзовый": "свежий бирюзовый градиентный фон",
	        "Тёплый коричневый": "тёплый коричнево-золотистый фон",
	        "Серый": "минималистичный светло-серый фон",
	    }
	    theme_ru = background_by_theme.get(theme, "современный градиентный фон")

	    # Drop the bullet markers and blank lines before joining.
	    advantages = []
	    for raw_line in bullets_ru.split("\n"):
	        if raw_line.strip():
	            advantages.append(raw_line.strip("• ").strip())
	    bullets_join = "; ".join(advantages)

	    base_ru = (
	        "Рекламная карточка товара для маркетплейса, современный минималистичный дизайн, "
	        f"{theme_ru}, крупное фото товара по центру на прозрачном фоне, "
	        "аккуратные надписи на русском языке. "
	        f"Товар: {title_ru}. Основные преимущества: {bullets_join}. "
	        "Стиль профессиональной промо-карточки для Ozon или Wildberries."
	    )
	    return translate_ru_en(base_ru)


	# ==========================
	# Шаги пайплайна
	# ==========================

	def step1_analyze_image(pil_image: Image.Image, det_threshold: float):
	    """Run detection, segmentation and captioning on the uploaded photo.

	    Returns:
	        ``(detection image, segmentation image, caption, image, caption)``;
	        the last two values are the ones wired to the stored state.

	    Raises:
	        gr.Error: If no image is supplied.
	    """
	    if pil_image is None:
	        raise gr.Error("Загрузите фото товара.")

	    detection_preview = od_predict(pil_image, det_threshold)
	    segmentation_preview = sam_predict(pil_image)
	    caption = caption_predict_ru(pil_image)

	    return detection_preview, segmentation_preview, caption, pil_image, caption


	def step2_generate_description(
	    stored_image: Image.Image,
	    stored_caption_ru: str,
	    audio_path: str,
	    audio_language: str,
	    extra_features_ru: str,
	):
	    """Build the Russian title, bullets and description for the card.

	    The caption, the voice transcript and the typed features are joined,
	    translated to English, summarised, and the summary is translated back.

	    Args:
	        stored_image: Image saved by step 1.
	        stored_caption_ru: Caption saved by step 1; regenerated when empty.
	        audio_path: Optional path of the seller's voice description.
	        audio_language: "Английский" or "Русский".
	        extra_features_ru: Optional extra features typed by the seller.

	    Returns:
	        ``(transcript shown to the user, title, bullets, description)``.

	    Raises:
	        gr.Error: If step 1 has not stored an image yet.
	    """
	    if stored_image is None:
	        raise gr.Error("Сначала пройдите Шаг 1 и проанализируйте фото товара.")

	    if not stored_caption_ru:
	        stored_caption_ru = caption_predict_ru(stored_image)

	    transcript_ru = transcribe_audio(audio_path, audio_language)
	    if transcript_ru and audio_language == "Английский":
	        # The rest of the pipeline treats the transcript as Russian text.
	        transcript_ru = translate_en_ru(transcript_ru)

	    # Only real content goes to the summariser; None and blank parts are skipped.
	    source_ru = ". ".join(
	        x for x in [stored_caption_ru, transcript_ru, extra_features_ru] if x and x.strip()
	    )

	    source_en = translate_ru_en(source_ru)

	    desc_en = nlp_summarizer_en(
	        source_en,
	        max_length=220,
	        min_length=60,
	        do_sample=False,
	    )[0]["summary_text"]

	    desc_ru = translate_en_ru(desc_en)

	    title_ru = build_title_from_caption(stored_caption_ru)
	    bullets_ru = build_bullets_from_text(desc_ru, max_bullets=5)

	    # The placeholder is shown to the user only, never fed into the summary.
	    transcript_display = transcript_ru or "(нет голосового описания продавца)"

	    return (
	        transcript_display,
	        title_ru,
	        bullets_ru,
	        desc_ru,
	    )


	def step3_check_relevance(
	    stored_image: Image.Image,
	    stored_caption_ru: str,
	    description_ru: str,
	):
	    """Score how well the description and the caption match the stored image.

	    Both texts are translated to English and scored with ``retrieval_predict``.

	    Raises:
	        gr.Error: If the image or the description is missing.
	    """
	    if stored_image is None or not description_ru:
	        raise gr.Error("Нужны фото товара (Шаг 1) и описание (Шаг 2).")

	    caption_ru = stored_caption_ru if stored_caption_ru else caption_predict_ru(stored_image)

	    caption_en = translate_ru_en(caption_ru)
	    description_en = translate_ru_en(description_ru)

	    prob_desc = retrieval_predict(stored_image, description_en)
	    prob_caption = retrieval_predict(stored_image, caption_en)

	    report_lines = [
	        "Соответствие изображения и текста (0–1):",
	        f"- Описание карточки vs изображение: {prob_desc:.4f}",
	        f"- Базовая подпись vs изображение: {prob_caption:.4f}",
	    ]
	    return "\n".join(report_lines)


	def step4_tts(
	    title_ru: str,
	    description_ru: str,
	    tts_language: str,
	):
	    """Voice the title followed by the description in the chosen language.

	    For any language other than "Русский" the text is first translated to English.

	    Raises:
	        gr.Error: If the title or the description is empty.
	    """
	    if not (title_ru and description_ru):
	        raise gr.Error("Сначала сформируйте заголовок и описание на Шаге 2.")

	    text_ru = f"{title_ru}. {description_ru}"
	    tts_text = text_ru if tts_language == "Русский" else translate_ru_en(text_ru)
	    return tts_predict(tts_text, tts_language)


	def step5_generate_card_and_prompt(
	    stored_image: Image.Image,
	    title_ru: str,
	    bullets_ru: str,
	    theme: str,
	) -> tuple[Image.Image, str]:
	    """Render the 900x1200 product card and build the matching image prompt.

	    The cut-out product is placed on a gradient chosen by ``theme``, and the
	    title plus up to five bullets are drawn on a light panel at the bottom.

	    Args:
	        stored_image: Product photo saved by step 1.
	        title_ru: Card title.
	        bullets_ru: Newline-separated bullets, optionally prefixed with "• ".
	        theme: Palette name; unknown names use the sandy palette.

	    Returns:
	        ``(card as an RGB image, English prompt)``.

	    Raises:
	        gr.Error: If the image, the title or the bullets are missing.
	    """
	    if stored_image is None:
	        raise gr.Error("Нужно фото товара (Шаг 1).")
	    if not title_ru or not bullets_ru:
	        raise gr.Error("Нужны заголовок и ключевые преимущества (Шаг 2).")

	    card_w, card_h = 900, 1200

	    palettes = {
	        "Песочный": ((255, 230, 200), (215, 180, 150)),
	        "Бирюзовый": ((210, 235, 245), (160, 205, 225)),
	        "Тёплый коричневый": ((215, 170, 130), (165, 115, 80)),
	        "Серый": ((240, 240, 245), (210, 210, 220)),
	    }
	    top_color, bottom_color = palettes.get(theme, ((255, 230, 200), (215, 180, 150)))

	    bg = create_vertical_gradient((card_w, card_h), top_color, bottom_color)
	    card = bg.convert("RGBA")

	    product_rgba = cutout_product(stored_image)

	    # Fit the product into 60% x 50% of the card, preserving aspect ratio.
	    max_img_w = int(card_w * 0.6)
	    max_img_h = int(card_h * 0.5)
	    w, h = product_rgba.size
	    scale = min(max_img_w / w, max_img_h / h)
	    # A very elongated cutout could otherwise round one side down to 0 px.
	    new_w = max(1, int(w * scale))
	    new_h = max(1, int(h * scale))
	    product_resized = product_rgba.resize((new_w, new_h), Image.LANCZOS)

	    img_x = (card_w - new_w) // 2
	    img_y = int(card_h * 0.17)
	    card.alpha_composite(product_resized, (img_x, img_y))

	    # Semi-opaque white panel behind the text block.
	    text_block_h = int(card_h * 0.4)
	    text_block_y = card_h - text_block_h - 40
	    overlay = Image.new("RGBA", (card_w - 80, text_block_h), (255, 255, 255, 235))
	    card.alpha_composite(overlay, (40, text_block_y))

	    draw = ImageDraw.Draw(card)
	    title_font = load_font(46, bold=True)
	    bullet_font = load_font(26, bold=False)

	    x_text = 60
	    y_text = text_block_y + 24

	    title_lines = wrap_text(title_ru, title_font, card_w - 2 * x_text, draw)
	    for line in title_lines:
	        draw.text((x_text, y_text), line, font=title_font, fill=(20, 20, 20))
	        bbox = draw.textbbox((x_text, y_text), line, font=title_font)
	        y_text = bbox[3] + 6  # next line starts just below the drawn one

	    y_text += 8

	    bullet_lines = [line.strip("• ").strip() for line in bullets_ru.split("\n") if line.strip()]
	    bullet_lines = bullet_lines[:5]
	    max_bullet_width = card_w - 2 * x_text

	    for bl in bullet_lines:
	        bullet_text = "• " + bl
	        wrapped = wrap_text(bullet_text, bullet_font, max_bullet_width, draw)
	        for wline in wrapped:
	            draw.text((x_text, y_text), wline, font=bullet_font, fill=(40, 40, 40))
	            bbox = draw.textbbox((x_text, y_text), wline, font=bullet_font)
	            y_text = bbox[3] + 3
	        y_text += 4

	    prompt_en = build_image_prompt(title_ru, bullets_ru, theme)

	    final_card = card.convert("RGB")
	    return final_card, prompt_en


	# ==========================
	# UI
	# ==========================

	# Custom CSS passed to gr.Blocks in build_app: limits the container width,
	# centres it, and centres h1-h3 headings.
	css = """
	#root .gradio-container {
	max-width: 1100px;
	margin-left: auto;
	margin-right: auto;
	}
	#root .gradio-container h1,
	#root .gradio-container h2,
	#root .gradio-container h3 {
	text-align: center;
	}
	"""


	def build_step1_tab(state_image, state_caption):
	    """Build the "Step 1" tab: photo upload, detection/segmentation previews and caption.

	    The button runs ``step1_analyze_image`` and also writes the image and
	    caption into ``state_image`` / ``state_caption`` for the later steps.
	    """
	    with gr.TabItem("Шаг 1. Анализ фото товара"):
	        gr.Markdown(
	            """
	### Шаг 1. Анализ фото товара

	- Загрузите фото товара.
	- YOLOS выделит основной объект (bbox «товар»).
	- SegFormer (B4) покажет маску товара (для наглядности).
	- Будет сгенерирована русская подпись к изображению (BLIP + перевод).
	"""
	        )
	        with gr.Row():
	            with gr.Column():
	                img_in = gr.Image(type="pil", label="Фото товара")
	            with gr.Column():
	                det_out = gr.Image(label="Обнаруженный товар (bbox)", type="pil")
	                seg_out = gr.Image(label="Сегментация товара", type="pil")
	        with gr.Row():
	            caption_out = gr.Textbox(
	                label="Базовая подпись к изображению (RU)",
	                lines=2,
	            )
	        det_thr = gr.Slider(
	            0.0,
	            1.0,
	            value=0.25,
	            step=0.01,
	            label="Порог уверенности (для YOLOS почти не важен, выбираем основной объект)",
	        )
	        run = gr.Button("Проанализировать фото")

	        # The last two outputs persist the image and caption for steps 2-5.
	        run.click(
	            fn=step1_analyze_image,
	            inputs=[img_in, det_thr],
	            outputs=[det_out, seg_out, caption_out, state_image, state_caption],
	        )


	def build_step2_tab(
	    state_image,
	    state_caption,
	):
	    """Build the "Step 2" tab: voice/features inputs and editable title, bullets, description.

	    Returns:
	        The title, bullets and description textboxes, so later tabs can
	        read whatever the user leaves in them.
	    """
	    with gr.TabItem("Шаг 2. Описание товара"):
	        gr.Markdown(
	            """
	### Шаг 2. Описание товара

	- Надиктуйте голосовое описание (по-русски) — опционально.
	- Допишите ключевые характеристики.
	- На основе фото, подписи, голоса и характеристик будет сформирована
	карточка товара на русском: Title + bullets + полное описание.

	Поля ниже можно редактировать вручную — изменения пойдут в последующие шаги.
	"""
	        )
	        with gr.Row():
	            with gr.Column():
	                audio_in = gr.Audio(
	                    sources=["microphone", "upload"],
	                    type="filepath",
	                    label="Голосовое описание (опционально)",
	                )
	                audio_lang = gr.Radio(
	                    ["Английский", "Русский"],
	                    value="Русский",
	                    label="Язык голосового описания",
	                )
	                extra_features = gr.Textbox(
	                    label="Ключевые характеристики / преимущества (RU)",
	                    lines=4,
	                    placeholder="Материал, размер, цвет, комплектация, гарантия и т.п.",
	                )
	                run = gr.Button("Сформировать описание")
	            with gr.Column():
	                transcript_out = gr.Textbox(
	                    label="Транскрипт голосового описания (RU)",
	                    lines=4,
	                )
	        with gr.Row():
	            title_out = gr.Textbox(
	                label="Заголовок карточки (Title, RU)",
	                lines=2,
	                interactive=True,
	            )
	        with gr.Row():
	            bullets_out = gr.Textbox(
	                label="Ключевые преимущества (bullets, RU)",
	                lines=6,
	                interactive=True,
	            )
	            description_out = gr.Textbox(
	                label="Полное описание карточки товара (RU)",
	                lines=8,
	                interactive=True,
	            )

	        # Reads the image and caption stored by step 1.
	        run.click(
	            fn=step2_generate_description,
	            inputs=[state_image, state_caption, audio_in, audio_lang, extra_features],
	            outputs=[
	                transcript_out,
	                title_out,
	                bullets_out,
	                description_out,
	            ],
	        )

	    return title_out, bullets_out, description_out


	def build_step3_tab(state_image, state_caption, description_comp):
	    """Build the "Step 3" tab, whose button runs ``step3_check_relevance``."""
	    with gr.TabItem("Шаг 3. Проверка соответствия"):
	        gr.Markdown(
	            """
	### Шаг 3. Проверка соответствия фото и описания

	Внутри текст переводится на английский и прогоняется через BLIP ITM,
	но пользователю показываются только числовые оценки.
	"""
	        )
	        relevance_out = gr.Textbox(
	            label="Результат проверки (Image–Text Relevance)",
	            lines=6,
	        )
	        run = gr.Button("Проверить соответствие")

	        # description_comp is the step 2 textbox, so manual edits are scored too.
	        run.click(
	            fn=step3_check_relevance,
	            inputs=[state_image, state_caption, description_comp],
	            outputs=relevance_out,
	        )


	def build_step4_tab(title_comp, description_comp):
	    """Build the "Step 4" tab, whose button runs ``step4_tts`` on the step 2 title and description."""
	    with gr.TabItem("Шаг 4. Озвучка карточки"):
	        gr.Markdown(
	            """
	### Шаг 4. Озвучка карточки товара

	Озвучивается заголовок и полное описание (Bark TTS).
	Можно выбрать русский или английский голос.
	"""
	        )
	        tts_lang = gr.Radio(
	            ["Английский", "Русский"],
	            value="Русский",
	            label="Язык озвучки",
	        )
	        tts_out = gr.Audio(
	            label="Аудио-презентация товара",
	            type="filepath",
	        )
	        run = gr.Button("Сгенерировать озвучку")

	        run.click(
	            fn=step4_tts,
	            inputs=[title_comp, description_comp, tts_lang],
	            outputs=tts_out,
	        )


	def build_step5_tab(state_image, title_comp, bullets_comp):
	    """Build the "Step 5" tab, whose button runs ``step5_generate_card_and_prompt``."""
	    with gr.TabItem("Шаг 5. Карточка товара (изображение)"):
	        gr.Markdown(
	            """
	### Шаг 5. Визуальная карточка товара

	- Товар вырезается из фото по цвету фона (без белого прямоугольника).
	- Вставляется на градиентный фон.
	- Снизу блок с заголовком и bullets.
	- Параллельно формируется prompt (EN) для text-to-image моделей.
	"""
	        )
	        # The choices match the palette keys used in step5_generate_card_and_prompt.
	        theme = gr.Radio(
	            ["Песочный", "Бирюзовый", "Тёплый коричневый", "Серый"],
	            value="Песочный",
	            label="Цветовая тема",
	        )
	        card_out = gr.Image(
	            label="Сгенерированная карточка товара",
	            type="pil",
	        )
	        prompt_out = gr.Textbox(
	            label="Prompt для генеративной модели (EN)",
	            lines=6,
	        )
	        run = gr.Button("Сгенерировать карточку")

	        run.click(
	            fn=step5_generate_card_and_prompt,
	            inputs=[state_image, title_comp, bullets_comp, theme],
	            outputs=[card_out, prompt_out],
	        )


	def build_app() -> gr.Blocks:
	    """Assemble the five-step Gradio app and return the ``gr.Blocks`` instance."""
	    with gr.Blocks(
	        css=css,
	        title="Marketplace Multimodal Hub (RU)",
	        theme=gr.themes.Soft(),
	    ) as demo:
	        gr.Markdown(
	            """
	# Marketplace Multimodal Hub (RU)

	Пошаговый пайплайн создания русской карточки товара для маркетплейса
	на основе фото, голосового описания и текстовых характеристик.
	"""
	        )

	        # State written by step 1 and read by the later steps.
	        state_image = gr.State()
	        state_caption = gr.State()

	        with gr.Tabs():
	            build_step1_tab(state_image, state_caption)
	            # Step 2 returns its editable textboxes; steps 3-5 take them as inputs.
	            title_comp, bullets_comp, description_comp = build_step2_tab(
	                state_image,
	                state_caption,
	            )
	            build_step3_tab(state_image, state_caption, description_comp)
	            build_step4_tab(title_comp, description_comp)
	            build_step5_tab(state_image, title_comp, bullets_comp)

	    return demo


	# Built at import time so a module-level ``app`` object always exists.
	app = build_app()

	if __name__ == "__main__":
	    # Start the Gradio server only when the file is run as a script.
	    app.launch()