import logging
import os
from pathlib import Path

import gradio as gr
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration,
)
from peft import PeftModel
import spaces


# ========= Config =========
# Every setting below can be overridden through an environment variable of the
# name given as the first argument; the second argument is the fallback value.

# Base language model and the adapter (repo + subfolder) applied on top of it.
MODEL_ID_BASE = os.environ.get("BASE_MODEL_ID", "openai/gpt-oss-20b")
ADAPTER_REPO = os.environ.get(
    "ADAPTER_REPO", "ZennyKenny/oss-20b-prereform-to-modern-ru-merged"
)
ADAPTER_SUBFOLDER = os.environ.get("ADAPTER_SUBFOLDER", "checkpoint-60")

# Vision-language model used for the OCR step.
OCR_MODEL_ID = os.environ.get("OCR_MODEL_ID", "ChatDOC/OCRFlux-3B")

# Upper bounds on generated tokens for each step (not exposed in the UI).
OCR_MAX_NEW_TOKENS = int(os.environ.get("OCR_MAX_NEW_TOKENS", "6000"))
CONVERT_MAX_NEW_TOKENS = int(os.environ.get("CONVERT_MAX_NEW_TOKENS", "6000"))

# Generation settings for the conversion step (not exposed in the UI).
TEMPERATURE = float(os.environ.get("CONVERT_TEMPERATURE", "0.2"))
TOP_P = float(os.environ.get("CONVERT_TOP_P", "0.9"))
TOP_K = int(os.environ.get("CONVERT_TOP_K", "40"))
REPETITION_PENALTY = float(os.environ.get("CONVERT_REP_PENALTY", "1.05"))


# ========= Load prompts =========
def _read_prompt_variable(path: Path, variable: str, default: str) -> str:
    """Return the prompt bound to *variable* in the Python file at *path*.

    Prompt overrides are best-effort: *default* is returned when the file is
    missing, cannot be read or executed, does not define *variable*, or binds
    it to something that is not a non-blank string. Every fallback other than
    a missing file or missing variable is logged as a warning.

    Args:
        path: Location of the optional override file.
        variable: Name of the module-level variable holding the prompt.
        default: Prompt to use when no usable override is found.

    Returns:
        The override string, or *default*.
    """
    log = logging.getLogger(__name__)
    namespace = {}
    try:
        if path.exists():
            # SECURITY: exec() runs the whole file as Python code with this
            # process's privileges. Only ship override files you trust.
            exec(path.read_text(encoding="utf-8"), namespace)
    except Exception:
        # Executed code can raise anything; a broken override must not stop
        # the app, but it should not vanish without a trace either.
        log.warning(
            "Could not load prompt override %s; using the built-in default.",
            path.name,
            exc_info=True,
        )
        return default

    value = namespace.get(variable, default)
    if not isinstance(value, str) or not value.strip():
        # A non-string or blank prompt would be formatted verbatim into the
        # model input (e.g. the text "None"), so reject it here.
        log.warning(
            "%s in %s is not a non-empty string; using the built-in default.",
            variable,
            path.name,
        )
        return default
    return value


def _load_system_prompt():
    """Return the conversion system prompt.

    Uses ``SYSTEM_PROMPT`` from ``text-prompt.py`` next to this file when that
    file provides a usable value, otherwise the built-in Russian default.
    """
    default = (
        "Ты компетентный редактор русского языка. "
        "Преобразуй дореформенную русскую орфографию (до 1918 года) "
        "в современную орфографию. Сохраняй смысл, пунктуацию и регистр. "
        "Не добавляй комментариев. Верни только преобразованный текст."
    )
    return _read_prompt_variable(
        Path(__file__).with_name("text-prompt.py"), "SYSTEM_PROMPT", default
    )


def _load_ocr_prompt():
    """Return the OCR instruction prompt.

    Uses ``OCR_PROMPT`` from ``ocr-prompt.py`` next to this file when that
    file provides a usable value, otherwise the built-in Russian default.
    """
    default = (
        "Извлеки из изображения весь текст БУКВАЛЬНО и на русском языке. "
        "Ничего не переводить и не исправлять. "
        "Сохраняй дореформенную орфографию и специальные символы. "
        "Верни только чистый текст (plain text)."
    )
    return _read_prompt_variable(
        Path(__file__).with_name("ocr-prompt.py"), "OCR_PROMPT", default
    )


# Resolved once at import time; the rest of the module reads these constants.
SYSTEM_PROMPT = _load_system_prompt()
OCR_PROMPT = _load_ocr_prompt()


def build_conversion_prompt(pre_reform_text: str) -> str:
    """Assemble the plain-text prompt for the conversion model.

    The prompt has three sections separated by blank lines: the system
    prompt, the whitespace-trimmed source text under its label, and the label
    after which the model is expected to continue with the modern spelling.
    """
    source_text = pre_reform_text.strip()
    sections = (
        f"{SYSTEM_PROMPT}",
        f"Текст (дореформ.):\n{source_text}",
        "Текст (современная орфография):",
    )
    return "\n\n".join(sections)


@spaces.GPU()
def _ocr_image_to_text(image) -> str:
    """Extract the text shown in *image* with the OCR vision-language model.

    The processor and model for ``OCR_MODEL_ID`` are loaded on every call.
    ``OCR_PROMPT`` is sent as the system message and the image as the only
    user content; the chat template turns both into the model input. Decoding
    is greedy (``do_sample=False``).

    Args:
        image: The image to read. The UI passes a PIL image.

    Returns:
        The decoded model output with surrounding whitespace removed. When
        the processor output contains ``input_ids``, only the newly generated
        tokens are decoded; otherwise the whole output sequence is.
    """
    processor = AutoProcessor.from_pretrained(OCR_MODEL_ID, trust_remote_code=True)

    # Prefer bfloat16 where CUDA reports support for it; otherwise float16.
    torch_dtype = (
        torch.bfloat16
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        else torch.float16
    )
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        OCR_MODEL_ID,
        trust_remote_code=True,
        torch_dtype=torch_dtype,
        device_map="auto",
    )

    # The system turn carries the OCR instructions; the user turn carries
    # nothing but the image.
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": OCR_PROMPT}],
        },
        {
            "role": "user",
            "content": [{"type": "image", "image": image}],
        },
    ]

    # Render the conversation to a string first, then let the processor build
    # the text + image tensors from it.
    chat_text = processor.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )
    inputs = processor(
        text=[chat_text],
        images=[image],
        return_tensors="pt",
    )

    # Move every tensor input onto the model's device.
    for key, value in inputs.items():
        if isinstance(value, torch.Tensor):
            inputs[key] = value.to(model.device)

    # Greedy decoding. No temperature is passed: it is a sampling-only
    # setting, and a value of 0.0 conflicts with do_sample=False.
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=OCR_MAX_NEW_TOKENS,
            do_sample=False,
            use_cache=True,
        )

    # Use the processor's tokenizer when it exposes one; otherwise load the
    # tokenizer for the same model id.
    tokenizer = getattr(processor, "tokenizer", None)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(OCR_MODEL_ID, trust_remote_code=True)

    # Drop the prompt tokens when their count is known so that only the
    # continuation is decoded.
    generated = output_ids[0]
    if "input_ids" in inputs:
        generated = generated[inputs["input_ids"].shape[1]:]

    return tokenizer.decode(generated, skip_special_tokens=True).strip()


# ========= ZeroGPU: Conversion step (LoRA applied) =========
@spaces.GPU(duration=300)  # 5 minutes
def _convert_text_zerogpu(pre_reform_text: str) -> str:
    """Convert pre-reform Russian text to modern orthography.

    Loads the tokenizer from ``ADAPTER_REPO`` and the base model
    ``MODEL_ID_BASE``, applies the adapter found in
    ``ADAPTER_REPO``/``ADAPTER_SUBFOLDER``, and generates a continuation of
    the prompt built by ``build_conversion_prompt``. Everything is loaded on
    every call.

    Sampling uses the module-level ``TEMPERATURE``, ``TOP_P`` and ``TOP_K``.
    When ``TEMPERATURE`` is not positive, sampling is switched off and
    decoding is greedy, because sampling requires a strictly positive
    temperature.

    Args:
        pre_reform_text: Text in pre-reform orthography.

    Returns:
        The generated continuation, stripped. If it decodes to an empty
        string, the full decoded sequence is used instead, cut after the
        "modern orthography" label when that label is present.
    """
    log = logging.getLogger(__name__)

    tokenizer = AutoTokenizer.from_pretrained(
        ADAPTER_REPO, use_fast=True, trust_remote_code=True
    )
    if tokenizer.pad_token_id is None:
        # Reuse the end-of-sequence token for padding when none is defined.
        tokenizer.pad_token = tokenizer.eos_token

    # Prefer bfloat16 where CUDA reports support for it; otherwise float16.
    torch_dtype = (
        torch.bfloat16
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        else torch.float16
    )
    base = AutoModelForCausalLM.from_pretrained(
        MODEL_ID_BASE,
        trust_remote_code=True,
        torch_dtype=torch_dtype,
        device_map="auto",
    )

    model = PeftModel.from_pretrained(
        base, ADAPTER_REPO, subfolder=ADAPTER_SUBFOLDER
    )
    try:
        model = model.merge_and_unload()
    except Exception:
        # Best-effort: if merging fails, generation continues with the
        # adapter-wrapped model, but the failure is recorded.
        log.warning(
            "Could not merge the adapter into the base model; "
            "continuing with the unmerged model.",
            exc_info=True,
        )

    try:
        model.config.pad_token_id = tokenizer.pad_token_id
    except Exception:
        # Best-effort as well: generation is still attempted without it.
        log.warning("Could not set pad_token_id on the model config.", exc_info=True)

    prompt = build_conversion_prompt(pre_reform_text)
    enc = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    input_ids = enc["input_ids"].to(model.device)
    attention_mask = enc.get("attention_mask")
    if attention_mask is None:
        # No mask from the tokenizer: attend to every prompt token.
        attention_mask = torch.ones_like(input_ids)
    attention_mask = attention_mask.to(model.device)

    gen_kwargs = dict(
        max_new_tokens=CONVERT_MAX_NEW_TOKENS,
        repetition_penalty=REPETITION_PENALTY,
        use_cache=True,
    )
    if TEMPERATURE > 0:
        gen_kwargs.update(
            do_sample=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            top_k=TOP_K,
        )
    else:
        # A non-positive temperature cannot be sampled with; decode greedily.
        gen_kwargs["do_sample"] = False

    with torch.no_grad():
        out_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **gen_kwargs,
        )

    # Decode only the tokens generated after the prompt.
    continuation = out_ids[0, input_ids.shape[1]:]
    out = tokenizer.decode(continuation, skip_special_tokens=True).strip()

    if not out:
        # Nothing decoded from the continuation: fall back to the full
        # sequence and keep what follows the output label, if it is there.
        full = tokenizer.decode(out_ids[0], skip_special_tokens=True).strip()
        marker = "Текст (современная орфография):"
        out = full.split(marker, 1)[-1].strip() if marker in full else full

    return out


# ========= Orchestrator =========
def process(image, manual_text):
    """Run the OCR step (when an image is given) and then the conversion step.

    Manually entered text comes first and the OCR text is appended after a
    blank line. When neither source yields any text, nothing is converted.

    Returns:
        A ``(modern_text, ocr_text)`` tuple; both are empty strings when
        there was nothing to convert.
    """
    ocr_text = _ocr_image_to_text(image) if image is not None else ""
    typed_text = manual_text.strip() if manual_text and manual_text.strip() else ""

    if typed_text and ocr_text:
        combined = f"{typed_text}\n\n{ocr_text}".strip()
    else:
        # At most one source has content; use whichever it is.
        combined = typed_text or ocr_text

    if not combined:
        return "", ""

    return _convert_text_zerogpu(combined), ocr_text


# ========= UI =========
# Build the Gradio interface. Components appear on the page in the order they
# are created inside the `with` blocks below.
# NOTE(review): "style.css" is passed as the `css` argument; some Gradio
# releases treat `css` as literal CSS and take file paths through a separate
# `css_paths` argument — confirm against the pinned Gradio version.
with gr.Blocks(css="style.css") as novoyaz:
    # Page title.
    gr.Markdown(
        """
        # Новояз — преобразование дореформенной орфографии в современную
        """
    )
    # Non-interactive logo image loaded from a remote URL.
    gr.Image(
        value="https://i.ibb.co/JWWws0SK/image.png",
        show_label=False,
        height=400,
        width=400,
        interactive=False,
        elem_id="novoyaz-logo",
    )
    # Description, technical notes and usage instructions shown to the user.
    gr.Markdown(
        """
        Загрузите изображение со старой русской орфографией (дореформенной) **или** вставьте такой текст вручную — получите результат в **современной орфографии**. Без лишних комментариев, с сохранением смысла и пунктуации.

        ## Техническая информация
        Внутри используются две открытые модели:
        - **OCR для извлечения текста**: [ChatDOC/OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B)  
        - **Преобразование орфографии**: [openai/gpt-oss-20b](https://huggingface.co/openai/gpt-oss-20b) + [ZennyKenny/oss-20b-prereform-to-modern-ru-merged](https://huggingface.co/ZennyKenny/oss-20b-prereform-to-modern-ru-merged)

        Запросы исполняются на **ZeroGPU**. Все модели — **с открытым исходным кодом**.

        ## Инструкция по использованию
        1. Загрузите изображение (PNG/JPG) или вставьте дореформенный текст вручную.
        2. Можно совместить оба варианта — текст будет объединён перед преобразованием.
        3. Нажмите **«Распознать и преобразовать»** и получите современную орфографию.
        4. Проверяйте «Промежуточный текст из OCR» для сверки.
        """
    )

    with gr.Row():
        # Left column: inputs (image upload, optional manual text, run button).
        with gr.Column():
            # type="pil" makes Gradio hand the upload to `process` as a PIL image.
            img = gr.Image(label="Изображение с дореформенным текстом", type="pil")
            manual = gr.Textbox(
                label="(Необязательно) Вставьте дореформенный текст вручную",
                lines=10,
                placeholder="Например: \"въ мирѣ сёмъ многа есть...\"",
            )
            btn = gr.Button("Распознать и преобразовать", variant="primary")
        # Right column: outputs (converted text, plus the raw OCR text in a
        # collapsed accordion for checking).
        with gr.Column():
            out_modern = gr.Textbox(label="Современная орфография (результат)", lines=18)
            with gr.Accordion("Промежуточный текст из OCR (для проверки)", open=False):
                out_ocr = gr.Textbox(label="Текст из OCRFlux-3B", lines=12)

    # Clicking the button calls `process(image, manual_text)`; its two return
    # values fill the converted-text box and the OCR-text box, in that order.
    btn.click(
        fn=process,
        inputs=[img, manual],
        outputs=[out_modern, out_ocr],
        api_name="process",
    )

if __name__ == "__main__":
    # Opt out of Hugging Face Hub telemetry unless the environment already
    # carries an explicit setting for it.
    # NOTE(review): this runs after the Hugging Face libraries were imported
    # at the top of the file — confirm the variable is still honoured when it
    # is set this late.
    if "HF_HUB_DISABLE_TELEMETRY" not in os.environ:
        os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

    # Enable request queuing, then start the web server.
    queued_app = novoyaz.queue()
    queued_app.launch()