Spaces:
Sleeping
Sleeping
| import os | |
| from pathlib import Path | |
| import gradio as gr | |
| import torch | |
| from transformers import ( | |
| AutoTokenizer, | |
| AutoModelForCausalLM, | |
| AutoProcessor, | |
| Qwen2_5_VLForConditionalGeneration, | |
| ) | |
| from peft import PeftModel | |
| import spaces | |
# ========= Config =========
# Each setting is read from the environment once, at import time; the second
# argument of every lookup is the built-in default used when the variable is
# not set.

# Repo ids for the text-conversion model and the adapter applied on top of it.
MODEL_ID_BASE = os.environ.get("BASE_MODEL_ID", "openai/gpt-oss-20b")
ADAPTER_REPO = os.environ.get(
    "ADAPTER_REPO", "ZennyKenny/oss-20b-prereform-to-modern-ru-merged"
)
ADAPTER_SUBFOLDER = os.environ.get("ADAPTER_SUBFOLDER", "checkpoint-60")

# Repo id of the vision-language model used for OCR.
OCR_MODEL_ID = os.environ.get("OCR_MODEL_ID", "ChatDOC/OCRFlux-3B")

# High token budgets (not exposed in the UI).
OCR_MAX_NEW_TOKENS = int(os.environ.get("OCR_MAX_NEW_TOKENS", "6000"))
CONVERT_MAX_NEW_TOKENS = int(os.environ.get("CONVERT_MAX_NEW_TOKENS", "6000"))

# Fixed generation knobs for the conversion step (not exposed in the UI).
# A value that cannot be parsed as int/float raises ValueError at import.
TEMPERATURE = float(os.environ.get("CONVERT_TEMPERATURE", "0.2"))
TOP_P = float(os.environ.get("CONVERT_TOP_P", "0.9"))
TOP_K = int(os.environ.get("CONVERT_TOP_K", "40"))
REPETITION_PENALTY = float(os.environ.get("CONVERT_REP_PENALTY", "1.05"))
| # ========= Load prompts ========= | |
def _load_system_prompt():
    """Return the system prompt used for the orthography conversion.

    If a file named ``text-prompt.py`` sits next to this module, it is
    executed and the value of its ``SYSTEM_PROMPT`` variable is returned.
    The built-in Russian default is returned instead when the file is absent,
    cannot be read or executed, does not define ``SYSTEM_PROMPT``, or defines
    it as something other than a string.

    Returns:
        str: The prompt text.
    """
    import logging

    default = (
        "Ты компетентный редактор русского языка. "
        "Преобразуй дореформенную русскую орфографию (до 1918 года) "
        "в современную орфографию. Сохраняй смысл, пунктуацию и регистр. "
        "Не добавляй комментариев. Верни только преобразованный текст."
    )
    path = Path(__file__).with_name("text-prompt.py")
    if not path.exists():
        return default

    namespace = {}
    try:
        # SECURITY: exec() runs the whole file as code with this process's
        # privileges. Tolerable only because the file ships next to this
        # module; never point this at user-supplied content.
        exec(path.read_text(encoding="utf-8"), namespace)
    except Exception:
        # Best effort by design: a broken override must not take the app
        # down, but it must not be hidden either.
        logging.getLogger(__name__).warning(
            "Could not load %s; using the built-in system prompt.",
            path,
            exc_info=True,
        )
        return default

    prompt = namespace.get("SYSTEM_PROMPT", default)
    if not isinstance(prompt, str):
        logging.getLogger(__name__).warning(
            "SYSTEM_PROMPT in %s is %s, not str; using the built-in system prompt.",
            path,
            type(prompt).__name__,
        )
        return default
    return prompt
def _load_ocr_prompt():
    """Return the instruction prompt given to the OCR model.

    If a file named ``ocr-prompt.py`` sits next to this module, it is
    executed and the value of its ``OCR_PROMPT`` variable is returned.
    The built-in Russian default is returned instead when the file is absent,
    cannot be read or executed, does not define ``OCR_PROMPT``, or defines it
    as something other than a string.

    Returns:
        str: The prompt text.
    """
    import logging

    default = (
        "Извлеки из изображения весь текст БУКВАЛЬНО и на русском языке. "
        "Ничего не переводить и не исправлять. "
        "Сохраняй дореформенную орфографию и специальные символы. "
        "Верни только чистый текст (plain text)."
    )
    path = Path(__file__).with_name("ocr-prompt.py")
    if not path.exists():
        return default

    namespace = {}
    try:
        # SECURITY: exec() runs the whole file as code with this process's
        # privileges. Tolerable only because the file ships next to this
        # module; never point this at user-supplied content.
        exec(path.read_text(encoding="utf-8"), namespace)
    except Exception:
        # Best effort by design: a broken override must not take the app
        # down, but it must not be hidden either.
        logging.getLogger(__name__).warning(
            "Could not load %s; using the built-in OCR prompt.",
            path,
            exc_info=True,
        )
        return default

    prompt = namespace.get("OCR_PROMPT", default)
    if not isinstance(prompt, str):
        logging.getLogger(__name__).warning(
            "OCR_PROMPT in %s is %s, not str; using the built-in OCR prompt.",
            path,
            type(prompt).__name__,
        )
        return default
    return prompt
# Resolve both prompts once, when the module is imported. SYSTEM_PROMPT is
# read by build_conversion_prompt(); OCR_PROMPT is sent as the system message
# in _ocr_image_to_text().
| SYSTEM_PROMPT = _load_system_prompt() | |
| OCR_PROMPT = _load_ocr_prompt() | |
def build_conversion_prompt(pre_reform_text: str) -> str:
    """Assemble the plain-text prompt for the conversion model.

    The prompt has three parts separated by blank lines: the module-level
    ``SYSTEM_PROMPT``, the input text (whitespace-stripped) under a
    "pre-reform text" label, and a trailing "modern orthography" label after
    which the model is expected to continue.

    Args:
        pre_reform_text: Text in pre-reform orthography.

    Returns:
        The assembled prompt; it ends with the label, with no trailing newline.
    """
    template = (
        "{system}\n\n"
        "Текст (дореформ.):\n{source}\n\n"
        "Текст (современная орфография):"
    )
    return template.format(system=SYSTEM_PROMPT, source=pre_reform_text.strip())
def _ocr_image_to_text(image) -> str:
    """Extract the text of *image* with the OCR vision-language model.

    The processor and model named by ``OCR_MODEL_ID`` are loaded on every
    call. A two-message chat is built (system: ``OCR_PROMPT``; user: the
    image), rendered with the processor's chat template, and decoded greedily
    for at most ``OCR_MAX_NEW_TOKENS`` new tokens.

    Args:
        image: The picture to read; passed unchanged to the processor.
            ``None`` yields an empty string without loading any model.

    Returns:
        The decoded model output, stripped of surrounding whitespace. Prompt
        tokens are sliced off whenever the processor output has ``input_ids``.
    """
    # NOTE(review): the conversion step is the only place with a ZeroGPU hint
    # in this file; this function most likely needs a `@spaces.GPU(...)`
    # decorator as well when deployed on ZeroGPU — confirm and pick a duration.
    if image is None:
        return ""

    # Load processor/model.
    processor = AutoProcessor.from_pretrained(OCR_MODEL_ID, trust_remote_code=True)
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        torch_dtype = torch.bfloat16
    else:
        torch_dtype = torch.float16
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        OCR_MODEL_ID,
        trust_remote_code=True,
        torch_dtype=torch_dtype,
        device_map="auto",
    )

    # The system message carries the OCR instructions; the user message
    # carries only the image.
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": OCR_PROMPT}],
        },
        {
            "role": "user",
            "content": [{"type": "image", "image": image}],
        },
    ]

    # Render the messages into a chat-formatted string (not yet tokenized).
    chat_text = processor.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )

    # Tokenize the text and preprocess the image in one call.
    inputs = processor(
        text=[chat_text],
        images=[image],
        return_tensors="pt",
    )

    # Move every tensor to the model's device; non-tensor entries stay as-is.
    # Iterate over a snapshot because the mapping is updated in the loop.
    for key, value in list(inputs.items()):
        if isinstance(value, torch.Tensor):
            inputs[key] = value.to(model.device)

    # Greedy decoding for deterministic OCR. `temperature` is a sampling-only
    # setting and is deliberately not passed together with do_sample=False.
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=OCR_MAX_NEW_TOKENS,
            do_sample=False,
            use_cache=True,
        )

    # The tokenizer normally lives on the processor; otherwise load it
    # separately from the same repo.
    tokenizer = getattr(processor, "tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(OCR_MODEL_ID, trust_remote_code=True)

    # Decode only the continuation when the prompt length is known; fall back
    # to decoding the full sequence otherwise.
    if "input_ids" in inputs:
        prompt_len = inputs["input_ids"].shape[1]
        generated = output_ids[0, prompt_len:]
    else:
        generated = output_ids[0]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()
| # ========= ZeroGPU: Conversion step (LoRA applied) ========= | |
# The file imports `spaces` and carried an orphaned "5 minutes" comment here,
# so the ZeroGPU decorator is restored with a 300-second budget.
@spaces.GPU(duration=300)  # 5 minutes
def _convert_text_zerogpu(pre_reform_text: str) -> str:
    """Convert pre-reform Russian text to modern orthography.

    Loads the tokenizer from ``ADAPTER_REPO`` and the base model
    ``MODEL_ID_BASE``, applies the PEFT adapter found in
    ``ADAPTER_REPO/ADAPTER_SUBFOLDER``, and generates a continuation of the
    prompt built by ``build_conversion_prompt``. Everything is loaded on
    every call.

    Args:
        pre_reform_text: Text in pre-reform orthography. Empty or
            whitespace-only input returns ``""`` without loading any model.

    Returns:
        The generated continuation with surrounding whitespace stripped. If
        the continuation decodes to nothing, the full sequence is decoded and
        whatever follows the "modern orthography" label is returned instead.
    """
    import logging

    log = logging.getLogger(__name__)

    if not pre_reform_text or not pre_reform_text.strip():
        return ""

    tokenizer = AutoTokenizer.from_pretrained(
        ADAPTER_REPO, use_fast=True, trust_remote_code=True
    )
    if tokenizer.pad_token_id is None:
        # generate() needs a pad token; reuse EOS when none is defined.
        tokenizer.pad_token = tokenizer.eos_token

    if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        torch_dtype = torch.bfloat16
    else:
        torch_dtype = torch.float16

    base = AutoModelForCausalLM.from_pretrained(
        MODEL_ID_BASE,
        trust_remote_code=True,
        torch_dtype=torch_dtype,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(
        base, ADAPTER_REPO, subfolder=ADAPTER_SUBFOLDER
    )

    # Best effort: if merging fails, keep generating through the PEFT
    # wrapper, but leave a trace of the failure.
    try:
        model = model.merge_and_unload()
    except Exception:
        log.warning(
            "merge_and_unload() failed; continuing with the unmerged adapter.",
            exc_info=True,
        )

    # Best effort: keep the model config's pad token in sync with the tokenizer.
    try:
        model.config.pad_token_id = tokenizer.pad_token_id
    except Exception:
        log.warning("Could not set pad_token_id on the model config.", exc_info=True)

    prompt = build_conversion_prompt(pre_reform_text)
    enc = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    input_ids = enc["input_ids"].to(model.device)
    attention_mask = enc.get("attention_mask")
    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids)
    attention_mask = attention_mask.to(model.device)

    gen_kwargs = dict(
        max_new_tokens=CONVERT_MAX_NEW_TOKENS,
        repetition_penalty=REPETITION_PENALTY,
        use_cache=True,
    )
    if TEMPERATURE > 0:
        gen_kwargs.update(
            do_sample=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            top_k=TOP_K,
        )
    else:
        # Sampling needs a strictly positive temperature; a configured
        # temperature of 0 therefore means greedy decoding.
        gen_kwargs["do_sample"] = False

    with torch.no_grad():
        out_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **gen_kwargs,
        )

    # Normal path: decode only the tokens generated after the prompt.
    continuation = out_ids[0, input_ids.shape[1]:]
    out = tokenizer.decode(continuation, skip_special_tokens=True).strip()
    if not out:
        # Fallback: decode everything and cut at the output label.
        full = tokenizer.decode(out_ids[0], skip_special_tokens=True).strip()
        marker = "Текст (современная орфография):"
        _, found, tail = full.partition(marker)
        out = tail.strip() if found else full
    return out
| # ========= Orchestrator ========= | |
def process(image, manual_text):
    """Run OCR (when an image is given) and convert the combined text.

    Args:
        image: Uploaded image, or ``None`` when nothing was uploaded.
        manual_text: Text typed by the user; may be ``None`` or blank.

    Returns:
        A ``(modern_text, ocr_text)`` tuple. ``ocr_text`` is the raw OCR
        result (``""`` without an image). When there is neither typed text
        nor OCR text, ``("", "")`` is returned and no conversion is run.
    """
    # OCR runs first, and only when an image was actually supplied.
    ocr_text = _ocr_image_to_text(image) if image is not None else ""
    typed_text = manual_text.strip() if manual_text else ""

    if typed_text and ocr_text:
        # Typed text goes first, then the OCR text after a blank line.
        source_text = (typed_text + "\n\n" + ocr_text).strip()
    else:
        source_text = typed_text or ocr_text

    if not source_text:
        return "", ""

    return _convert_text_zerogpu(source_text), ocr_text
| # ========= UI ========= | |
# Declares the Gradio Blocks app bound to the name `novoyaz`: a title, a
# logo, a description, the inputs (image upload, manual text box, submit
# button), the outputs (converted text, intermediate OCR text), and the
# click handler that connects them to process().
| with gr.Blocks(css="style.css") as novoyaz: | |
| gr.Markdown( | |
| """ | |
| # Новояз — преобразование дореформенной орфографии в современную | |
| """ | |
| ) | |
# Static, non-interactive logo image loaded from a remote URL.
| gr.Image( | |
| value="https://i.ibb.co/JWWws0SK/image.png", | |
| show_label=False, | |
| height=400, | |
| width=400, | |
| interactive=False, | |
| elem_id="novoyaz-logo", | |
| ) | |
# User-facing description: what the app does, which models it uses, and
# step-by-step usage instructions.
| gr.Markdown( | |
| """ | |
| Загрузите изображение со старой русской орфографией (дореформенной) **или** вставьте такой текст вручную — получите результат в **современной орфографии**. Без лишних комментариев, с сохранением смысла и пунктуации. | |
| ## Техническая информация | |
| Внутри используются две открытые модели: | |
| - **OCR для извлечения текста**: [ChatDOC/OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) | |
| - **Преобразование орфографии**: [openai/gpt-oss-20b](https://huggingface.co/openai/gpt-oss-20b) + [ZennyKenny/oss-20b-prereform-to-modern-ru-merged](https://huggingface.co/ZennyKenny/oss-20b-prereform-to-modern-ru-merged) | |
| Запросы исполняются на **ZeroGPU**. Все модели — **с открытым исходным кодом**. | |
| ## Инструкция по использованию | |
| 1. Загрузите изображение (PNG/JPG) или вставьте дореформенный текст вручную. | |
| 2. Можно совместить оба варианта — текст будет объединён перед преобразованием. | |
| 3. Нажмите **«Распознать и преобразовать»** и получите современную орфографию. | |
| 4. Проверяйте «Промежуточный текст из OCR» для сверки. | |
| """ | |
| ) | |
# Inputs and outputs. The upload component is created with type="pil";
# both it and the manual text box are passed to process() on click.
| with gr.Row(): | |
| with gr.Column(): | |
| img = gr.Image(label="Изображение с дореформенным текстом", type="pil") | |
| manual = gr.Textbox( | |
| label="(Необязательно) Вставьте дореформенный текст вручную", | |
| lines=10, | |
| placeholder="Например: \"въ мирѣ сёмъ многа есть...\"", | |
| ) | |
| btn = gr.Button("Распознать и преобразовать", variant="primary") | |
| with gr.Column(): | |
| out_modern = gr.Textbox(label="Современная орфография (результат)", lines=18) | |
| with gr.Accordion("Промежуточный текст из OCR (для проверки)", open=False): | |
| out_ocr = gr.Textbox(label="Текст из OCRFlux-3B", lines=12) | |
# The outputs are listed in the same order as process()'s return tuple:
# (modern text, OCR text).
| btn.click( | |
| fn=process, | |
| inputs=[img, manual], | |
| outputs=[out_modern, out_ocr], | |
| api_name="process", | |
| ) | |
if __name__ == "__main__":
    # Opt out of Hugging Face Hub telemetry unless the environment already
    # sets this variable.
    # NOTE(review): this runs after the Hugging Face libraries were imported
    # at the top of the file; anything that reads the variable at import time
    # will not see it — confirm, and move it above the imports if so.
    os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")

    # Turn on request queuing, then start the server.
    queued_app = novoyaz.queue()
    queued_app.launch()