ml-sharp

Sleeping

File size: 6,802 Bytes

01504c4

"""SHARP MCP Server for programmatic access to 3D Gaussian prediction.

Run standalone:
    uv run python mcp_server.py

Or integrate with MCP clients via stdio transport.
"""

from __future__ import annotations

import json
import os
from pathlib import Path
from typing import Literal

import torch
from mcp.server.fastmcp import FastMCP

from model_utils import (
    DEFAULT_OUTPUTS_DIR,
    ModelWrapper,
    TrajectoryType,
    get_global_model,
)

# Port number for the MCP server, read from the SHARP_MCP_PORT environment
# variable with a fallback of 49201. int() raises ValueError at import time if
# the variable is set to a non-numeric string.
# NOTE(review): MCP_PORT is not referenced again in the visible part of this
# file and the entry point calls mcp.run() without a port; confirm whether
# another module consumes it.
MCP_PORT: int = int(os.getenv("SHARP_MCP_PORT", "49201"))

# Shared FastMCP application object. The @mcp.tool() and @mcp.resource()
# decorators below register their functions on this instance.
# NOTE(review): confirm that the installed MCP SDK version accepts the
# `description` keyword on the FastMCP constructor.
mcp = FastMCP(
    "sharp",
    description="SHARP: Single-image 3D Gaussian scene prediction",
)

# -----------------------------------------------------------------------------
# Tools
# -----------------------------------------------------------------------------


@mcp.tool()
def sharp_predict(
    image_path: str,
    render_video: bool = True,
    trajectory_type: TrajectoryType = "rotate_forward",
    num_frames: int = 60,
    fps: int = 30,
    output_long_side: int | None = None,
) -> dict:
    """Predict 3D Gaussians from a single image.

    Args:
        image_path: Absolute path to input image (jpg/png/webp).
        render_video: Whether to render a camera trajectory video (requires CUDA).
        trajectory_type: Camera trajectory type (swipe/shake/rotate/rotate_forward).
        num_frames: Number of frames for video rendering.
        fps: Frames per second for video.
        output_long_side: Output resolution (longest side). None = match input.

    Returns:
        dict with keys:
            - ply_path: Path to exported PLY file
            - video_path: Path to rendered MP4 (or null if not rendered)
            - cuda_available: Whether CUDA was available

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
        ValueError: If ``image_path`` exists but is not a regular file
            (for example, a directory).
    """
    image_path_obj = Path(image_path)
    if not image_path_obj.exists():
        raise FileNotFoundError(f"Image not found: {image_path}")
    # exists() is also true for directories and other non-regular files; reject
    # those here so the caller gets a clear message instead of an opaque
    # failure from the image loader further down.
    if not image_path_obj.is_file():
        raise ValueError(f"Image path is not a file: {image_path}")

    model = get_global_model()
    video_path, ply_path = model.predict_and_maybe_render(
        image_path_obj,
        trajectory_type=trajectory_type,
        num_frames=num_frames,
        fps=fps,
        output_long_side=output_long_side,
        render_video=render_video,
    )

    return {
        "ply_path": str(ply_path),
        # A falsy video_path is reported as None (JSON null).
        "video_path": str(video_path) if video_path else None,
        "cuda_available": torch.cuda.is_available(),
    }


@mcp.tool()
def sharp_render(
    ply_path: str,
    trajectory_type: TrajectoryType = "rotate_forward",
    num_frames: int = 60,
    fps: int = 30,
    output_long_side: int | None = None,
) -> dict:
    """Render a video from an existing PLY file (not yet implemented).

    Rendering would require re-predicting from the original image, because
    the Gaussians are not stored in standard PLY format. Until loading
    Gaussians from PLY is supported, this tool ignores its arguments and
    returns an error payload.

    Args:
        ply_path: Path to PLY file (from previous prediction).
        trajectory_type: Camera trajectory type.
        num_frames: Number of frames.
        fps: Frames per second.
        output_long_side: Output resolution.

    Returns:
        dict with an ``error`` message and a ``hint`` explaining the limitation.
    """
    # All parameters are accepted only to fix the future interface; none is read.
    error_message = "Rendering from PLY not yet implemented. Use sharp_predict with render_video=True."
    hint_message = "PLY files store only point data, not the full Gaussian parameters needed for rendering."
    return {"error": error_message, "hint": hint_message}


@mcp.tool()
def list_outputs() -> dict:
    """List the generated PLY and MP4 files in the outputs directory.

    Returns:
        dict with keys:
            - outputs_dir: Path to outputs directory
            - ply_files: Sorted list of PLY file paths
            - video_files: Sorted list of MP4 file paths
    """
    root = DEFAULT_OUTPUTS_DIR
    listing: dict = {"outputs_dir": str(root)}

    # Only direct children of the outputs directory are matched (no recursion).
    # Paths are sorted before being converted to strings.
    for key, pattern in (("ply_files", "*.ply"), ("video_files", "*.mp4")):
        listing[key] = [str(match) for match in sorted(root.glob(pattern))]

    return listing


# -----------------------------------------------------------------------------
# Resources
# -----------------------------------------------------------------------------


@mcp.resource("sharp://info")
def get_info() -> str:
    """Get SHARP server info: GPU status and configuration, as a JSON string.

    Returns:
        A JSON document (2-space indent) with the model name, CUDA
        availability, one entry per visible CUDA device, the outputs
        directory, the checkpoint sources, and the relevant environment
        variables.
    """
    has_cuda = torch.cuda.is_available()
    # CUDA is only queried for devices when it reports itself as available.
    device_total = torch.cuda.device_count() if has_cuda else 0
    bytes_per_gib = 1024**3

    device_specs = [torch.cuda.get_device_properties(idx) for idx in range(device_total)]
    devices = [
        {
            "index": idx,
            "name": spec.name,
            "total_memory_gb": round(spec.total_memory / bytes_per_gib, 2),
            "compute_capability": f"{spec.major}.{spec.minor}",
        }
        for idx, spec in enumerate(device_specs)
    ]

    # Environment variables to report, paired with the value shown when unset.
    env_fallbacks = (
        ("SHARP_CHECKPOINT_PATH", "(not set)"),
        ("SHARP_KEEP_MODEL_ON_DEVICE", "1"),
        ("CUDA_VISIBLE_DEVICES", "(not set)"),
    )
    env_snapshot = {name: os.getenv(name, fallback) for name, fallback in env_fallbacks}

    payload = {
        "model": "SHARP (Apple ml-sharp)",
        "description": "Single-image 3D Gaussian scene prediction",
        "cuda_available": has_cuda,
        "cuda_device_count": device_total,
        "gpus": devices,
        "outputs_dir": str(DEFAULT_OUTPUTS_DIR),
        "checkpoint_sources": [
            "SHARP_CHECKPOINT_PATH env var",
            "HuggingFace Hub (apple/Sharp)",
            "Upstream CDN (torch.hub)",
        ],
        "env_vars": env_snapshot,
    }

    return json.dumps(payload, indent=2)


@mcp.resource("sharp://help")
def get_help() -> str:
    """Get usage help for the SHARP MCP server.

    Returns:
        Markdown text describing the registered tools, the resources and the
        environment variables, with leading and trailing whitespace removed.
    """
    # The text below is returned to clients verbatim. It lists every tool
    # registered in this module, including sharp_render, which currently only
    # returns an error payload.
    help_text = """
# SHARP MCP Server

## Tools

### sharp_predict
Predict 3D Gaussians from a single image.

Parameters:
- image_path (required): Absolute path to input image
- render_video: Whether to render MP4 (default: true, requires CUDA)
- trajectory_type: swipe | shake | rotate | rotate_forward (default: rotate_forward)
- num_frames: Number of video frames (default: 60)
- fps: Video frame rate (default: 30)
- output_long_side: Output resolution, null = match input

### sharp_render
Render a video from an existing PLY file. Not yet implemented: currently
returns an error. Use sharp_predict with render_video=true instead.

### list_outputs
List all generated PLY and MP4 files.

## Resources

### sharp://info
Server info, GPU status, configuration.

### sharp://help
This help text.

## Environment Variables

- SHARP_MCP_PORT: MCP server port (default: 49201)
- SHARP_CHECKPOINT_PATH: Local checkpoint path override
- SHARP_KEEP_MODEL_ON_DEVICE: Keep model on GPU (default: 1)
- CUDA_VISIBLE_DEVICES: GPU selection (e.g., "0" or "0,1")
"""
    return help_text.strip()


# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------

if __name__ == "__main__":
    # Script entry point (e.g. `uv run python mcp_server.py`): start the server
    # with FastMCP's default run() arguments, which the module docstring
    # describes as the stdio transport used by MCP clients.
    mcp.run()