Spaces:

MCP-1st-Birthday
/

gdrive-pdf-translator-mcp

Running

File size: 11,615 Bytes

92a0b42

"""

BabelDOC with Agentic AI - Modal Deployment



PDF translation API with layout preservation.

20-page limit during test phase.



Setup:

    modal secret create babeldocs-secrets \

      NEBIUS_API_KEY=your_key \

      NEBIUS_API_BASE=https://api.tokenfactory.nebius.com/v1/ \

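      NEBIUS_TRANSLATION_MODEL=openai/gpt-oss-120b

    Optional: also set NEBIUS_VISION_MODEL in the same secret to override the
    vision model used for image translation
    (default: Qwen/Qwen2.5-VL-72B-Instruct).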



Deploy:

    modal deploy modal_deploy.py

"""

import modal
import os
from pathlib import Path

# Absolute directory that contains this deploy script.
THIS_DIR = Path(__file__).parent.resolve()
# Local BabelDOC source tree: a "BabelDOC" folder next to this script's
# directory (i.e. inside THIS_DIR's parent). It is copied into the image below.
BABELDOC_DIR = THIS_DIR.parent / "BabelDOC"

# Max pages allowed (test phase limit). Compared against the PyMuPDF page
# count of the uploaded PDF and reported by the health endpoint.
MAX_PAGES = 20

# Modal app - custom name for hackathon
app = modal.App("mcp1stann-babeldocs")

# Image with uv and BabelDOC installed.
# The chain below is order-sensitive: system packages first, then uv, then the
# environment, then the BabelDOC sources, and finally the editable install
# that needs all of the above.
babeldocs_image = (
    modal.Image.debian_slim(python_version="3.11")
    # System libraries required by BabelDOC's native Python dependencies.
    .apt_install(
        "git",
        "libgl1-mesa-glx",
        "libglib2.0-0",
        "libsm6",
        "libxext6",
        "libxrender-dev",
        "libgomp1",
        "curl",
        "libspatialindex-dev",  # For rtree
        "libharfbuzz-dev",  # For uharfbuzz
        "libfreetype6-dev",  # For freetype-py
        "libopencv-dev",  # For opencv dependencies
        "libzstd-dev",  # For pyzstd
    )
    # uv is used by the run_commands step below to install BabelDOC.
    .pip_install("uv")
    # NOTE(review): UV_SYSTEM_PYTHON presumably makes uv install into the
    # image's system interpreter instead of a virtualenv - confirm in uv docs.
    .env({
        "PYTHONIOENCODING": "utf-8",
        "PYTHONUNBUFFERED": "1",
        "UV_SYSTEM_PYTHON": "1",
    })
    # FastAPI is needed for the @modal.fastapi_endpoint methods on the class.
    .pip_install("fastapi[standard]")
    # Ship the local BabelDOC checkout into the image at /app/BabelDOC.
    # NOTE(review): copy=True presumably bakes the files into an image layer so
    # the following run_commands step can see them - confirm in Modal docs.
    .add_local_dir(
        str(BABELDOC_DIR),
        remote_path="/app/BabelDOC",
        copy=True,
    )
    # Editable install of BabelDOC; this is what provides the `babeldoc` CLI
    # invoked by the translator class.
    .run_commands(
        "cd /app/BabelDOC && uv pip install -e . --python python3.11",
    )
)

# Volume for caching models and fonts (created on first use if it does not exist).
# NOTE(review): only HF_HOME is pointed at this volume by the translator's
# subprocess environment; whether fonts also land here depends on BabelDOC.
cache_volume = modal.Volume.from_name("babeldocs-cache", create_if_missing=True)
# Mount point of the volume inside the container; also passed as HF_HOME to
# the babeldoc subprocess.
CACHE_PATH = "/cache"


@app.cls(
    image=babeldocs_image,
    timeout=900,  # 15 minutes
    memory=8192,
    cpu=4,
    volumes={CACHE_PATH: cache_volume},
    secrets=[modal.Secret.from_name("babeldocs-secrets")],
    scaledown_window=300,  # Keep warm for 5 minutes
)
class BabelDocsTranslator:
    """Class-based translator that runs the ``babeldoc`` CLI inside Modal.

    The class exposes the same translation routine three ways: as a Modal
    method (``translate``), as a POST web endpoint (``api``), and internally
    (``_translate_internal``). ``health`` and ``languages`` are small GET
    endpoints returning static metadata.

    All translation entry points return a plain ``dict``. On success it has
    ``success: True``, a ``stats`` dict and, for every output variant that was
    produced and not suppressed, ``<prefix>_pdf_base64`` / ``<prefix>_filename``
    keys where ``<prefix>`` is one of ``mono``, ``mono_img``, ``dual``,
    ``dual_img``. On failure it has ``success: False`` and a ``message``.
    """

    def _count_pdf_pages(self, pdf_bytes: bytes) -> int:
        """Count pages in a PDF using PyMuPDF.

        Args:
            pdf_bytes: Raw bytes of the PDF document.

        Returns:
            The number of pages, or ``-1`` when the count cannot be
            determined (PyMuPDF missing, unreadable document, ...).
        """
        try:
            import fitz  # PyMuPDF

            # The context manager closes the document even if len() raises,
            # so a damaged file cannot leak the open handle.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
                return len(doc)
        except Exception:
            # Deliberate best-effort: callers treat -1 as "unknown".
            return -1  # Unknown

    def _redact_secret(self, text: str, secret: str, limit: int) -> str:
        """Return at most ``limit`` characters of ``text`` with ``secret`` masked.

        Masking happens before truncation so a secret straddling the cut-off
        cannot be partially exposed. An empty ``secret`` is ignored
        (``str.replace("")`` would otherwise insert the mask everywhere).
        """
        if not text:
            return ""
        if secret:
            text = text.replace(secret, "***")
        return text[:limit]

    def _find_output(self, pdfs: list, target_lang: str, kind: str, with_images: bool):
        """Return the first PDF in ``pdfs`` matching one output variant, or ``None``.

        Args:
            pdfs: Candidate ``Path`` objects from the output directory.
            target_lang: Language code embedded in the output file name.
            kind: ``"mono"`` or ``"dual"``.
            with_images: Select the ``images_translated`` variant when true;
                otherwise that variant is explicitly excluded.
        """
        if with_images:
            marker = f".{target_lang}.{kind}.images_translated.pdf"
        else:
            marker = f".{target_lang}.{kind}.pdf"
        for pdf in pdfs:
            if marker not in pdf.name:
                continue
            if not with_images and "images_translated" in pdf.name:
                continue
            return pdf
        return None

    def _translate_internal(
        self,
        pdf_base64: str,
        target_lang: str = "fr",
        pages: str = "",
        no_dual: bool = False,
        no_mono: bool = False,
    ) -> dict:
        """BabelDOC with Agentic AI - Internal translation.

        Decodes the PDF, enforces the page limit, runs the ``babeldoc`` CLI in
        a temporary directory and returns the generated PDFs base64-encoded.

        Args:
            pdf_base64: Base64-encoded input PDF.
            target_lang: Value passed to ``--lang-out``; also used to recognise
                the output file names.
            pages: Optional page selection passed to ``--pages``. When set,
                ``--only-include-translated-page`` is added as well.
            no_dual: Skip the dual (bilingual) outputs.
            no_mono: Skip the mono (translated-only) outputs.

        Returns:
            A result dict as described on the class. This method does not
            raise: any exception is converted into a failure dict.
        """
        import base64
        import subprocess
        import tempfile
        import time
        from pathlib import Path

        try:
            if not pdf_base64:
                return {"success": False, "message": "No PDF provided"}

            pdf_bytes = base64.b64decode(pdf_base64)

            # Check page limit (test phase). An unknown count (-1) is never
            # greater than MAX_PAGES, so the limit is not enforced when the
            # page count cannot be determined.
            page_count = self._count_pdf_pages(pdf_bytes)
            if page_count > MAX_PAGES:
                return {
                    "success": False,
                    "message": f"PDF has {page_count} pages. Maximum allowed: {MAX_PAGES} pages (test phase limit)."
                }

            # Kept in a local so it can be masked in any output returned to
            # the caller (it is passed to the CLI on the command line).
            api_key = os.getenv("NEBIUS_API_KEY", "")

            with tempfile.TemporaryDirectory() as tmpdir:
                input_path = Path(tmpdir) / "input.pdf"
                output_dir = Path(tmpdir) / "output"
                output_dir.mkdir()

                input_path.write_bytes(pdf_bytes)

                cmd = [
                    "babeldoc",
                    "--files", str(input_path),
                    "--output", str(output_dir),
                    "--lang-out", target_lang,
                    "--openai",
                    "--openai-model", os.getenv("NEBIUS_TRANSLATION_MODEL", "openai/gpt-oss-120b"),
                    "--openai-base-url", os.getenv("NEBIUS_API_BASE", "https://api.tokenfactory.nebius.com/v1/"),
                    "--openai-api-key", api_key,
                    "--no-watermark",
                    "--translate-table-text",
                    "--enhance-compatibility",
                    # Enable image translation (orchestration PASS 2) with vision model
                    "--vision-model", os.getenv("NEBIUS_VISION_MODEL", "Qwen/Qwen2.5-VL-72B-Instruct"),
                ]

                if pages:
                    cmd.extend(["--pages", pages])
                    cmd.append("--only-include-translated-page")

                if no_dual:
                    cmd.append("--no-dual")

                if no_mono:
                    cmd.append("--no-mono")

                # Monotonic clock: immune to wall-clock adjustments while the
                # (potentially minutes-long) subprocess runs.
                started = time.monotonic()

                result = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    encoding="utf-8",
                    errors="replace",
                    cwd="/app/BabelDOC",
                    env={
                        **os.environ,
                        # Redirect HF_HOME to the mounted cache volume.
                        "HF_HOME": CACHE_PATH,
                    },
                )

                duration = time.monotonic() - started

                if result.returncode != 0:
                    # The CLI output goes back to an external caller, so the
                    # API key is masked before the text is truncated.
                    return {
                        "success": False,
                        "message": "Translation failed",
                        "stderr": self._redact_secret(result.stderr, api_key, 1000),
                        "stdout": self._redact_secret(result.stdout, api_key, 500),
                    }

                # Find all 4 types of PDFs:
                # Format: name.no_watermark.{lang}.{mono|dual}.pdf
                # Format: name.no_watermark.{lang}.{mono|dual}.images_translated.pdf
                # Sorted so the "first match" is deterministic across runs.
                all_pdfs = sorted(output_dir.glob("*.pdf"))
                if not all_pdfs:
                    return {"success": False, "message": "No output PDF generated"}

                # (result-key prefix, matched file or None, suppressed by caller)
                outputs = [
                    ("mono", self._find_output(all_pdfs, target_lang, "mono", False), no_mono),
                    ("mono_img", self._find_output(all_pdfs, target_lang, "mono", True), no_mono),
                    ("dual", self._find_output(all_pdfs, target_lang, "dual", False), no_dual),
                    ("dual_img", self._find_output(all_pdfs, target_lang, "dual", True), no_dual),
                ]

                if all(path is None for _, path, _ in outputs):
                    # Fallback to any PDF: unrecognised naming, expose the
                    # first file under the "mono" keys.
                    outputs[0] = ("mono", all_pdfs[0], no_mono)

                result_data = {
                    "success": True,
                    "stats": {
                        "duration_seconds": round(duration, 2),
                    }
                }

                attached = 0
                for prefix, path, suppressed in outputs:
                    if path is None or suppressed:
                        continue
                    payload = path.read_bytes()
                    result_data[f"{prefix}_pdf_base64"] = base64.b64encode(payload).decode("utf-8")
                    result_data[f"{prefix}_filename"] = path.name
                    result_data["stats"][f"{prefix}_size_bytes"] = len(payload)
                    attached += 1

                if not attached:
                    # Never report success without at least one PDF payload.
                    return {
                        "success": False,
                        "message": "No output PDF available for the requested options (no_mono/no_dual excluded every generated file)",
                    }

                return result_data

        except Exception as e:
            # Boundary handler: callers (Modal method / HTTP endpoint) always
            # receive a dict rather than an exception.
            return {"success": False, "message": f"Error: {str(e)}"}

    @modal.method()
    def translate(
        self,
        pdf_base64: str,
        target_lang: str = "fr",
        pages: str = "",
        no_dual: bool = False,
        no_mono: bool = False,
    ) -> dict:
        """Translate method (callable via Modal).

        Thin wrapper that forwards all arguments to ``_translate_internal``
        and returns its result dict unchanged.
        """
        return self._translate_internal(pdf_base64, target_lang, pages, no_dual, no_mono)

    @modal.fastapi_endpoint(method="POST")
    def api(self, request: dict) -> dict:
        """
        FastAPI endpoint POST for PDF translation.

        Request body:
        {
            "pdf_base64": "base64_encoded_pdf",
            "target_lang": "fr",
            "pages": "1,2,3" (optional),
            "no_dual": false,
            "no_mono": false
        }

        Missing keys fall back to the defaults shown above (empty PDF string,
        "fr", no page selection, both flags false). Returns the result dict
        of ``_translate_internal``.
        """
        pdf_base64 = request.get("pdf_base64", "")
        target_lang = request.get("target_lang", "fr")
        pages = request.get("pages", "")
        no_dual = request.get("no_dual", False)
        no_mono = request.get("no_mono", False)

        return self._translate_internal(pdf_base64, target_lang, pages, no_dual, no_mono)

    @modal.fastapi_endpoint(method="GET")
    def health(self) -> dict:
        """Health check endpoint returning static service metadata and the page limit."""
        return {
            "status": "healthy",
            "service": "BabelDOC with Agentic AI",
            "version": "1.0.0",
            "max_pages": MAX_PAGES,
        }

    @modal.fastapi_endpoint(method="GET")
    def languages(self) -> dict:
        """Get supported languages.

        Returns a static mapping of language code to English name. The list
        is informational: ``target_lang`` is not validated against it.
        """
        return {
            "languages": {
                "fr": "French",
                "en": "English",
                "es": "Spanish",
                "de": "German",
                "it": "Italian",
                "pt": "Portuguese",
                "zh": "Chinese",
                "ja": "Japanese",
                "ko": "Korean",
                "ru": "Russian",
                "ar": "Arabic",
            }
        }


@app.local_entrypoint()
def main():
    """Print a short banner: service name, page limit, and the deploy/serve commands."""
    banner_lines = (
        "BabelDOC with Agentic AI - Modal Deployment",
        "=" * 45,
        f"Max pages: {MAX_PAGES} (test phase)",
        "",  # blank separator line
        "Deploy: modal deploy modal_deploy.py",
        "Test:   modal serve modal_deploy.py",
    )
    for line in banner_lines:
        print(line)