#!/usr/bin/env python3
"""
Serve the DeepSWE-Preview-FP8 model with vLLM using the following
configuration:
- CUDA devices 1 and 2
- Max model length 32000
- Tensor parallel size 2
"""

import os
import subprocess
import sys


def serve_model():
    # Restrict vLLM to GPUs 1 and 2 only.
    os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

    # Build the vLLM OpenAI-compatible API server command. Use
    # sys.executable so the server runs under the same interpreter
    # as this script.
    cmd = [
        sys.executable, "-m", "vllm.entrypoints.openai.api_server",
        "--host", "0.0.0.0",
        "--port", "8550",
        "--model", "/home/op/DeepSWE-Preview-FP8",  # local model path
        "--max-model-len", "32000",
        "--tensor-parallel-size", "2",
        "--pipeline-parallel-size", "1",
    ]

    print("Starting vLLM server with the following configuration:")
    print(f"CUDA_VISIBLE_DEVICES: {os.environ['CUDA_VISIBLE_DEVICES']}")
    print("Model path: /home/op/DeepSWE-Preview-FP8")
    print("Max model length: 32000")
    print("Tensor parallel size: 2")
    print("Pipeline parallel size: 1")
    print("\nCommand:", " ".join(cmd))
    print("\n" + "=" * 50)

    # Run the server; this blocks until the process exits or is interrupted.
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running vLLM server: {e}")
        sys.exit(1)
    except KeyboardInterrupt:
        print("\nServer stopped by user")
        sys.exit(0)


if __name__ == "__main__":
    serve_model()
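
# ---------------------------------------------------------------------------
# Example client (a minimal sketch, not part of this script): once the server
# is up, it exposes an OpenAI-compatible API on port 8550. This assumes the
# `openai` Python package (v1+) is installed; by default the model name must
# match the path passed via --model above, unless --served-model-name is set.
#
#   from openai import OpenAI
#
#   # vLLM does not check the API key by default, but the client requires one.
#   client = OpenAI(base_url="http://localhost:8550/v1", api_key="EMPTY")
#   response = client.chat.completions.create(
#       model="/home/op/DeepSWE-Preview-FP8",
#       messages=[{"role": "user", "content": "Write a hello-world in C."}],
#       max_tokens=128,
#   )
#   print(response.choices[0].message.content)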