Spaces:

ZennyKenny
/

Novoyaz

Sleeping

App Files Files Community

Novoyaz / app.py

ZennyKenny

Update app.py

4b69a9f verified 2 months ago

raw

history blame contribute delete

11.4 kB

	import logging
	import os
	from pathlib import Path

	import gradio as gr
	import torch
	from transformers import (
	    AutoTokenizer,
	    AutoModelForCausalLM,
	    AutoProcessor,
	    Qwen2_5_VLForConditionalGeneration,
	)
	from peft import PeftModel
	import spaces


	# ========= Config =========
	# Every setting below is read from an environment variable; the second
	# argument is the fallback used when that variable is not set.

	# Base causal LM plus the PEFT adapter (repo and checkpoint subfolder)
	# that is loaded on top of it for the conversion step.
	MODEL_ID_BASE = os.environ.get("BASE_MODEL_ID", "openai/gpt-oss-20b")
	ADAPTER_REPO = os.environ.get(
	    "ADAPTER_REPO", "ZennyKenny/oss-20b-prereform-to-modern-ru-merged"
	)
	ADAPTER_SUBFOLDER = os.environ.get("ADAPTER_SUBFOLDER", "checkpoint-60")

	# Model used for the OCR step.
	OCR_MODEL_ID = os.environ.get("OCR_MODEL_ID", "ChatDOC/OCRFlux-3B")

	# Upper bounds on newly generated tokens (not exposed in the UI).
	OCR_MAX_NEW_TOKENS = int(os.environ.get("OCR_MAX_NEW_TOKENS", "6000"))
	CONVERT_MAX_NEW_TOKENS = int(os.environ.get("CONVERT_MAX_NEW_TOKENS", "6000"))

	# Sampling parameters for the conversion step (not exposed in the UI).
	TEMPERATURE = float(os.environ.get("CONVERT_TEMPERATURE", "0.2"))
	TOP_P = float(os.environ.get("CONVERT_TOP_P", "0.9"))
	TOP_K = int(os.environ.get("CONVERT_TOP_K", "40"))
	REPETITION_PENALTY = float(os.environ.get("CONVERT_REP_PENALTY", "1.05"))


	# ========= Load prompts =========
	def _load_prompt_from_file(path, variable, default):
	    """Return the string bound to *variable* in the Python file at *path*.

	    Args:
	        path: ``pathlib.Path`` of an optional Python file holding the prompt.
	        variable: Name of the module-level variable to read from that file.
	        default: Prompt returned when the file is missing, fails to execute,
	            does not define *variable*, or defines it as a non-string.

	    Returns:
	        The prompt text to use.
	    """
	    namespace = {}
	    try:
	        if path.exists():
	            # NOTE: exec() runs the file's code with full privileges. This is
	            # only acceptable because the file ships alongside this module;
	            # never point it at user-supplied content.
	            exec(path.read_text(encoding="utf-8"), namespace)
	    except Exception:
	        # Best effort: a broken prompt file must not stop the app from
	        # starting, but the failure should be visible in the logs.
	        logging.getLogger(__name__).warning(
	            "Could not load %s from %s; using the built-in default prompt.",
	            variable,
	            path,
	            exc_info=True,
	        )
	        return default

	    value = namespace.get(variable, default)
	    if not isinstance(value, str):
	        logging.getLogger(__name__).warning(
	            "%s in %s is %s, not str; using the built-in default prompt.",
	            variable,
	            path,
	            type(value).__name__,
	        )
	        return default
	    return value


	def _load_system_prompt():
	    """Return the conversion system prompt.

	    Reads ``SYSTEM_PROMPT`` from ``text-prompt.py`` next to this file when
	    available, otherwise returns the built-in Russian default.
	    """
	    default = (
	        "Ты компетентный редактор русского языка. "
	        "Преобразуй дореформенную русскую орфографию (до 1918 года) "
	        "в современную орфографию. Сохраняй смысл, пунктуацию и регистр. "
	        "Не добавляй комментариев. Верни только преобразованный текст."
	    )
	    path = Path(__file__).with_name("text-prompt.py")
	    return _load_prompt_from_file(path, "SYSTEM_PROMPT", default)


	def _load_ocr_prompt():
	    """Return the OCR instruction prompt.

	    Reads ``OCR_PROMPT`` from ``ocr-prompt.py`` next to this file when
	    available, otherwise returns the built-in Russian default.
	    """
	    default = (
	        "Извлеки из изображения весь текст БУКВАЛЬНО и на русском языке. "
	        "Ничего не переводить и не исправлять. "
	        "Сохраняй дореформенную орфографию и специальные символы. "
	        "Верни только чистый текст (plain text)."
	    )
	    path = Path(__file__).with_name("ocr-prompt.py")
	    return _load_prompt_from_file(path, "OCR_PROMPT", default)


	# Prompts are resolved once, at import time.
	SYSTEM_PROMPT = _load_system_prompt()
	OCR_PROMPT = _load_ocr_prompt()


	def build_conversion_prompt(pre_reform_text: str) -> str:
	    """Assemble the full text prompt for the conversion model.

	    The prompt is three sections separated by blank lines: the system
	    prompt, the stripped pre-reform text under its label, and the label
	    after which the model is expected to continue.
	    """
	    source_text = pre_reform_text.strip()
	    sections = (
	        f"{SYSTEM_PROMPT}",
	        "Текст (дореформ.):\n" + source_text,
	        "Текст (современная орфография):",
	    )
	    return "\n\n".join(sections)


	@spaces.GPU()
	def _ocr_image_to_text(image) -> str:
	    """Extract text from *image* with the model named by ``OCR_MODEL_ID``.

	    The processor and model are loaded inside the call. The request is
	    chat-formatted: ``OCR_PROMPT`` is the system message and the image is
	    the only user content. Decoding is greedy (``do_sample=False``) and
	    capped at ``OCR_MAX_NEW_TOKENS`` new tokens.

	    Args:
	        image: The image to read; it is handed to the processor unchanged.

	    Returns:
	        The decoded model output with surrounding whitespace stripped.
	    """
	    processor = AutoProcessor.from_pretrained(OCR_MODEL_ID, trust_remote_code=True)

	    # Prefer bfloat16 when the GPU supports it, otherwise use float16.
	    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
	    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	        OCR_MODEL_ID,
	        trust_remote_code=True,
	        torch_dtype=torch.bfloat16 if use_bf16 else torch.float16,
	        device_map="auto",
	    )

	    # System message carries the OCR instructions; the user turn carries
	    # the image only.
	    messages = [
	        {
	            "role": "system",
	            "content": [{"type": "text", "text": OCR_PROMPT}],
	        },
	        {
	            "role": "user",
	            "content": [{"type": "image", "image": image}],
	        },
	    ]

	    # Render the messages to a prompt string, then let the processor build
	    # the combined text + image tensors.
	    chat_text = processor.apply_chat_template(
	        messages, add_generation_prompt=True, tokenize=False
	    )
	    inputs = processor(
	        text=[chat_text],
	        images=[image],
	        return_tensors="pt",
	    )

	    # Move every tensor to the model's device; non-tensor entries stay put.
	    for key, value in inputs.items():
	        if isinstance(value, torch.Tensor):
	            inputs[key] = value.to(model.device)

	    # Greedy decoding. No temperature is passed: it only applies when
	    # sampling, and combining it with do_sample=False makes transformers
	    # report an invalid generation flag.
	    with torch.no_grad():
	        output_ids = model.generate(
	            **inputs,
	            max_new_tokens=OCR_MAX_NEW_TOKENS,
	            do_sample=False,
	            use_cache=True,
	        )

	    # The tokenizer normally lives on the processor; load it separately
	    # only if the processor does not expose one.
	    tokenizer = getattr(processor, "tokenizer", None)
	    if tokenizer is None:
	        tokenizer = AutoTokenizer.from_pretrained(OCR_MODEL_ID, trust_remote_code=True)

	    # Decode only the generated continuation when the prompt length is
	    # known; otherwise fall back to decoding the whole sequence.
	    generated = output_ids[0]
	    if "input_ids" in inputs:
	        prompt_len = inputs["input_ids"].shape[1]
	        generated = generated[prompt_len:]

	    return tokenizer.decode(generated, skip_special_tokens=True).strip()


	# ========= ZeroGPU: Conversion step (LoRA applied) =========
	@spaces.GPU(duration=300)  # 5 minutes
	def _convert_text_zerogpu(pre_reform_text: str) -> str:
	    """Convert pre-reform Russian text to modern orthography.

	    Loads the tokenizer from ``ADAPTER_REPO``, the base model from
	    ``MODEL_ID_BASE`` and the PEFT adapter from
	    ``ADAPTER_REPO``/``ADAPTER_SUBFOLDER``, then samples a continuation of
	    the prompt built by ``build_conversion_prompt``.

	    Args:
	        pre_reform_text: Text in pre-reform orthography.

	    Returns:
	        The generated continuation, stripped. If the continuation decodes
	        to an empty string, the whole sequence is decoded instead and
	        everything after the final prompt label is returned (or the whole
	        decoded text when that label cannot be found).
	    """
	    log = logging.getLogger(__name__)

	    tokenizer = AutoTokenizer.from_pretrained(
	        ADAPTER_REPO, use_fast=True, trust_remote_code=True
	    )
	    if tokenizer.pad_token_id is None:
	        # Reuse EOS as the padding token when the tokenizer defines none.
	        tokenizer.pad_token = tokenizer.eos_token

	    # Prefer bfloat16 when the GPU supports it, otherwise use float16.
	    use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
	    base = AutoModelForCausalLM.from_pretrained(
	        MODEL_ID_BASE,
	        trust_remote_code=True,
	        torch_dtype=torch.bfloat16 if use_bf16 else torch.float16,
	        device_map="auto",
	    )

	    model = PeftModel.from_pretrained(
	        base, ADAPTER_REPO, subfolder=ADAPTER_SUBFOLDER
	    )
	    # Merging the adapter is best effort: if it fails, generation proceeds
	    # with the un-merged PEFT wrapper, but the failure is logged.
	    try:
	        model = model.merge_and_unload()
	    except Exception:
	        log.warning(
	            "merge_and_unload() failed; continuing with the un-merged adapter.",
	            exc_info=True,
	        )

	    # Also best effort: keep the model config's pad token in sync.
	    try:
	        model.config.pad_token_id = tokenizer.pad_token_id
	    except Exception:
	        log.warning("Could not set pad_token_id on the model config.", exc_info=True)

	    prompt = build_conversion_prompt(pre_reform_text)
	    enc = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
	    input_ids = enc["input_ids"].to(model.device)
	    attention_mask = enc.get("attention_mask")
	    if attention_mask is None:
	        # No mask returned: attend to every prompt token.
	        attention_mask = torch.ones_like(input_ids)
	    attention_mask = attention_mask.to(model.device)

	    gen_kwargs = dict(
	        max_new_tokens=CONVERT_MAX_NEW_TOKENS,
	        temperature=TEMPERATURE,
	        top_p=TOP_P,
	        top_k=TOP_K,
	        repetition_penalty=REPETITION_PENALTY,
	        do_sample=True,
	        use_cache=True,
	        # Passed explicitly so generate() does not have to guess a pad token.
	        pad_token_id=tokenizer.pad_token_id,
	    )

	    with torch.no_grad():
	        out_ids = model.generate(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            **gen_kwargs,
	        )

	    # Drop the prompt tokens and decode only what was generated.
	    continuation = out_ids[0, input_ids.shape[1]:]
	    out = tokenizer.decode(continuation, skip_special_tokens=True).strip()

	    if not out:
	        # Fallback: decode everything and cut at the final prompt label.
	        full = tokenizer.decode(out_ids[0], skip_special_tokens=True).strip()
	        marker = "Текст (современная орфография):"
	        out = full.split(marker, 1)[-1].strip() if marker in full else full

	    return out


	# ========= Orchestrator =========
	def process(image, manual_text):
	    """Run OCR (when an image is given) and convert the combined text.

	    Manually entered text, if any, comes first; the OCR result is appended
	    after a blank line. Returns ``(modern_text, ocr_text)``, or two empty
	    strings when there is nothing to convert.
	    """
	    ocr_text = _ocr_image_to_text(image) if image is not None else ""
	    typed_text = manual_text.strip() if manual_text else ""

	    if typed_text and ocr_text:
	        combined = (typed_text + "\n\n" + ocr_text).strip()
	    else:
	        # At most one source is present; use whichever it is.
	        combined = typed_text or ocr_text

	    if not combined:
	        return "", ""

	    return _convert_text_zerogpu(combined), ocr_text


	# ========= UI =========
	# Builds the Gradio Blocks app. The resulting `novoyaz` object is what the
	# `__main__` guard at the bottom of the file queues and launches.
	# NOTE(review): depending on the installed Gradio version, `css` may be
	# treated as literal CSS text rather than a path to a stylesheet — confirm
	# that "style.css" is actually being loaded.
	with gr.Blocks(css="style.css") as novoyaz:
	# Page title.
	gr.Markdown(
	"""
	# Новояз — преобразование дореформенной орфографии в современную
	"""
	)
	# Static, non-interactive logo image loaded from a remote URL.
	gr.Image(
	value="https://i.ibb.co/JWWws0SK/image.png",
	show_label=False,
	height=400,
	width=400,
	interactive=False,
	elem_id="novoyaz-logo",
	)
	# Description, technical notes (models used) and usage instructions.
	gr.Markdown(
	"""
	Загрузите изображение со старой русской орфографией (дореформенной) или вставьте такой текст вручную — получите результат в современной орфографии. Без лишних комментариев, с сохранением смысла и пунктуации.

	## Техническая информация
	Внутри используются две открытые модели:
	- OCR для извлечения текста: [ChatDOC/OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B)
	- Преобразование орфографии: [openai/gpt-oss-20b](https://huggingface.co/openai/gpt-oss-20b) + [ZennyKenny/oss-20b-prereform-to-modern-ru-merged](https://huggingface.co/ZennyKenny/oss-20b-prereform-to-modern-ru-merged)

	Запросы исполняются на ZeroGPU. Все модели — с открытым исходным кодом.

	## Инструкция по использованию
	1. Загрузите изображение (PNG/JPG) или вставьте дореформенный текст вручную.
	2. Можно совместить оба варианта — текст будет объединён перед преобразованием.
	3. Нажмите «Распознать и преобразовать» и получите современную орфографию.
	4. Проверяйте «Промежуточный текст из OCR» для сверки.
	"""
	)

	# Inputs: an image upload (delivered to the callback as a PIL image),
	# an optional manual text box, and the submit button.
	with gr.Row():
	with gr.Column():
	img = gr.Image(label="Изображение с дореформенным текстом", type="pil")
	manual = gr.Textbox(
	label="(Необязательно) Вставьте дореформенный текст вручную",
	lines=10,
	placeholder="Например: \"въ мирѣ сёмъ многа есть...\"",
	)
	btn = gr.Button("Распознать и преобразовать", variant="primary")
	# Outputs: the converted text, plus the raw OCR text inside an accordion
	# that starts collapsed.
	with gr.Column():
	out_modern = gr.Textbox(label="Современная орфография (результат)", lines=18)
	with gr.Accordion("Промежуточный текст из OCR (для проверки)", open=False):
	out_ocr = gr.Textbox(label="Текст из OCRFlux-3B", lines=12)

	# Clicking the button calls process(image, manual_text); its two return
	# values fill the result box and the OCR box, in that order.
	btn.click(
	fn=process,
	inputs=[img, manual],
	outputs=[out_modern, out_ocr],
	api_name="process",
	)

	if __name__ == "__main__":
	    # Opt out of Hugging Face Hub telemetry unless the environment already
	    # sets this variable.
	    # NOTE(review): transformers has already been imported by this point; if
	    # the hub library reads this variable at import time, setting it here is
	    # too late to take effect — confirm.
	    if "HF_HUB_DISABLE_TELEMETRY" not in os.environ:
	        os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
	    # Enable request queuing, then start the Gradio server.
	    novoyaz.queue().launch()