#!/usr/bin/env python3
"""
Serve the DeepSWE-Preview-FP8 model with vLLM using the following
configuration:
- CUDA devices 1 and 2
- Max model length 32000
- Tensor parallel size 2
"""

import os
import subprocess
import sys


def serve_model():
    # Restrict vLLM to GPUs 1 and 2 only.
    os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

    # Build the vLLM OpenAI-compatible API server command. Use
    # sys.executable so the server runs under the same interpreter
    # as this script.
    cmd = [
        sys.executable, "-m", "vllm.entrypoints.openai.api_server",
        "--host", "0.0.0.0",
        "--port", "8550",
        "--model", "/home/op/DeepSWE-Preview-FP8",  # local model path
        "--max-model-len", "32000",
        "--tensor-parallel-size", "2",
        "--pipeline-parallel-size", "1",
    ]

    print("Starting vLLM server with the following configuration:")
    print(f"CUDA_VISIBLE_DEVICES: {os.environ['CUDA_VISIBLE_DEVICES']}")
    print("Model path: /home/op/DeepSWE-Preview-FP8")
    print("Max model length: 32000")
    print("Tensor parallel size: 2")
    print("Pipeline parallel size: 1")
    print("\nCommand:", " ".join(cmd))
    print("\n" + "=" * 50)

    # Run the server; this blocks until the process exits or is interrupted.
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running vLLM server: {e}")
        sys.exit(1)
    except KeyboardInterrupt:
        print("\nServer stopped by user")
        sys.exit(0)


if __name__ == "__main__":
    serve_model()
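
# ---------------------------------------------------------------------------
# Example client (a minimal sketch, not part of this script): once the server
# is up, it exposes an OpenAI-compatible API on port 8550. This assumes the
# `openai` Python package (v1+) is installed; by default the model name must
# match the path passed via --model above, unless --served-model-name is set.
#
#   from openai import OpenAI
#
#   # vLLM does not check the API key by default, but the client requires one.
#   client = OpenAI(base_url="http://localhost:8550/v1", api_key="EMPTY")
#   response = client.chat.completions.create(
#       model="/home/op/DeepSWE-Preview-FP8",
#       messages=[{"role": "user", "content": "Write a hello-world in C."}],
#       max_tokens=128,
#   )
#   print(response.choices[0].message.content)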