Spaces:

Bmccloud22
/

LaunchLLM

Runtime error

App Files Files Community

Bmccloud22 committed on Nov 12

Commit

90a59c9

verified ·

1 Parent(s): 360e349

Deploy LaunchLLM - Production AI Training Platform

Browse files

Files changed (4) hide show

.gitignore +1 -0
README.md +2 -2
runpod_client.py +262 -0
runpod_manager.py +483 -0

.gitignore CHANGED Viewed

@@ -1,5 +1,6 @@
 __pycache__/
 *.py[cod]
 *.log
 .secrets/
 .gradio/

 __pycache__/
 *.py[cod]
+*.pyc
 *.log
 .secrets/
 .gradio/

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🚀
 colorFrom: blue
 colorTo: purple
 sdk: gradio
-sdk_version: 5.49.1
 app_file: app.py
 pinned: false
 license: apache-2.0
@@ -174,4 +174,4 @@ Start by clicking the **Environment** tab above and adding your HuggingFace toke
 ---
-**Built with ❤️ for domain experts who want custom AI without the complexity**

 colorFrom: blue
 colorTo: purple
 sdk: gradio
+sdk_version: 4.0.0
 app_file: app.py
 pinned: false
 license: apache-2.0
 ---
+**Built with ❤️ for domain experts who want custom AI without the complexity**

runpod_client.py ADDED Viewed

	@@ -0,0 +1,262 @@

+"""
+RunPod Client - Low-level GraphQL API client for RunPod
+Provides direct access to RunPod's GraphQL API for pod management.
+"""
+import os
+import requests
+from typing import Optional, List, Dict
+from dataclasses import dataclass
+@dataclass
+class PodInfo:
+    """Information about a RunPod pod"""
+    # Summary record built by RunPodClient.list_pods from one entry of the
+    # GraphQL `myself.pods` list.
+    # Pod identifier and display name, copied from the API's `id` / `name`.
+    id: str
+    name: str
+    # The API's `desiredStatus` value ("unknown" when the key is absent).
+    status: str
+    # Not a real GPU model name: list_pods sets this to "GPU", or to "GPU-"
+    # plus the first 8 characters of the first runtime GPU id when available.
+    gpu_type: str
+    # Copied from the API's `gpuCount` and `costPerHr` fields.
+    gpu_count: int
+    cost_per_hour: float
+    # Raw `runtime` object from the API response, if any.
+    runtime: Optional[Dict] = None
+class RunPodClient:
+    """Low-level client for RunPod GraphQL API.
+    Each public method posts one GraphQL document to ``self.endpoint``. When the
+    response contains an ``errors`` entry, the method prints it and returns an
+    empty/falsy value ([] / None / False) instead of raising. Transport errors
+    and non-200 responses raise from ``_query``.
+    """
+    def __init__(self, api_key: Optional[str] = None):
+        """Store credentials and build the request headers.
+        Args:
+            api_key: RunPod API key; falls back to the RUNPOD_API_KEY
+                environment variable when omitted or empty.
+        Raises:
+            ValueError: if no key is supplied and the variable is unset/empty.
+        """
+        self.api_key = api_key or os.getenv("RUNPOD_API_KEY")
+        if not self.api_key:
+            raise ValueError("RunPod API key required. Set RUNPOD_API_KEY environment variable.")
+        self.endpoint = "https://api.runpod.io/graphql"
+        self.headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+    def _query(self, query: str, variables: Optional[Dict] = None) -> Dict:
+        """Execute a GraphQL query and return the decoded JSON body.
+        Args:
+            query: GraphQL document text.
+            variables: optional variables mapping (sent as {} when omitted).
+        Raises:
+            Exception: if the HTTP status is not 200. Errors raised by
+                ``requests`` or by JSON decoding propagate unchanged.
+        """
+        payload = {
+            "query": query,
+            "variables": variables or {}
+        }
+        response = requests.post(
+            self.endpoint,
+            json=payload,
+            headers=self.headers,
+            timeout=30
+        )
+        if response.status_code != 200:
+            raise Exception(f"GraphQL request failed: {response.status_code} {response.text}")
+        return response.json()
+    def _run_operation(
+        self,
+        action: str,
+        query: str,
+        variables: Optional[Dict] = None
+    ) -> Optional[Dict]:
+        """Run a query and unwrap its ``data`` object.
+        Args:
+            action: present-participle phrase used in the printed error
+                message, e.g. "listing pods".
+            query: GraphQL document text.
+            variables: optional variables mapping.
+        Returns:
+            The ``data`` mapping (``{}`` when it is missing or null), or None
+            when the response reported ``errors``.
+        """
+        result = self._query(query, variables)
+        if "errors" in result:
+            print(f"Error {action}: {result['errors']}")
+            return None
+        # GraphQL may send an explicit null here; normalise so callers can chain .get().
+        return result.get("data") or {}
+    def list_pods(self) -> List[PodInfo]:
+        """List all pods of the authenticated account.
+        Returns:
+            One PodInfo per pod; an empty list when the API reports errors or
+            returns no pods. Null fields in the response fall back to the same
+            defaults used for absent fields.
+        """
+        query = """
+        query {
+            myself {
+                pods {
+                    id
+                    name
+                    desiredStatus
+                    runtime {
+                        gpus {
+                            id
+                        }
+                    }
+                    machine {
+                        podHostId
+                    }
+                    costPerHr
+                    gpuCount
+                }
+            }
+        }
+        """
+        data = self._run_operation("listing pods", query)
+        if data is None:
+            return []
+        pods_data = (data.get("myself") or {}).get("pods") or []
+        pods = []
+        for pod_data in pods_data:
+            # The query does not fetch a GPU model name, so derive a label
+            # from the first runtime GPU id, or use a generic placeholder.
+            gpus = (pod_data.get("runtime") or {}).get("gpus") or []
+            gpu_id = (gpus[0].get("id") or "") if gpus else ""
+            gpu_type = f"GPU-{gpu_id[:8]}" if gpu_id else "GPU"
+            pods.append(PodInfo(
+                id=pod_data["id"],
+                name=pod_data["name"],
+                status=pod_data.get("desiredStatus") or "unknown",
+                gpu_type=gpu_type,
+                gpu_count=pod_data.get("gpuCount") or 0,
+                cost_per_hour=pod_data.get("costPerHr") or 0.0,
+                runtime=pod_data.get("runtime")
+            ))
+        return pods
+    def create_pod(
+        self,
+        name: str,
+        image_name: str,
+        gpu_type_id: str,
+        gpu_count: int = 1,
+        volume_in_gb: int = 100,
+        container_disk_in_gb: int = 50,
+        ports: str = "8888/http"
+    ) -> Optional[str]:
+        """Create a new on-demand pod.
+        Args:
+            name: pod display name.
+            image_name: container image to run.
+            gpu_type_id: value sent as the API's ``gpuTypeId``.
+            gpu_count: number of GPUs.
+            volume_in_gb: value sent as ``volumeInGb``.
+            container_disk_in_gb: value sent as ``containerDiskInGb``.
+            ports: comma-separated port spec such as "8888/http,22/tcp".
+        Returns:
+            The new pod id, or None when the API reports errors or returns no pod.
+        """
+        query = """
+        mutation($input: PodFindAndDeployOnDemandInput!) {
+            podFindAndDeployOnDemand(input: $input) {
+                id
+                name
+                desiredStatus
+            }
+        }
+        """
+        variables = {
+            "input": {
+                "name": name,
+                "imageName": image_name,
+                "gpuTypeId": gpu_type_id,
+                "gpuCount": gpu_count,
+                "volumeInGb": volume_in_gb,
+                "containerDiskInGb": container_disk_in_gb,
+                "ports": ports,
+                "cloudType": "ALL"
+            }
+        }
+        data = self._run_operation("creating pod", query, variables)
+        if data is None:
+            return None
+        pod_data = data.get("podFindAndDeployOnDemand")
+        return pod_data.get("id") if pod_data else None
+    def stop_pod(self, pod_id: str) -> bool:
+        """Stop a running pod. Returns False when the API reports errors, else True."""
+        query = """
+        mutation($input: PodStopInput!) {
+            podStop(input: $input) {
+                id
+                desiredStatus
+            }
+        }
+        """
+        variables = {
+            "input": {
+                "podId": pod_id
+            }
+        }
+        return self._run_operation("stopping pod", query, variables) is not None
+    def terminate_pod(self, pod_id: str) -> bool:
+        """Terminate a pod. Returns False when the API reports errors, else True."""
+        query = """
+        mutation($input: PodTerminateInput!) {
+            podTerminate(input: $input)
+        }
+        """
+        variables = {
+            "input": {
+                "podId": pod_id
+            }
+        }
+        return self._run_operation("terminating pod", query, variables) is not None
+    def get_gpu_types(self) -> List[Dict]:
+        """Get available GPU types.
+        Returns:
+            The raw ``gpuTypes`` entries (id, displayName, memoryInGb,
+            secureCloud, communityCloud), or [] on errors or a null result.
+        """
+        query = """
+        query {
+            gpuTypes {
+                id
+                displayName
+                memoryInGb
+                secureCloud
+                communityCloud
+            }
+        }
+        """
+        data = self._run_operation("getting GPU types", query)
+        if data is None:
+            return []
+        return data.get("gpuTypes") or []
+    def get_pod_details(self, pod_id: str) -> Optional[Dict]:
+        """Get detailed information about a specific pod.
+        Returns:
+            The raw ``pod`` object, including ``runtime.ports`` mappings, or
+            None when the API reports errors or returns no pod.
+        """
+        query = """
+        query($podId: String!) {
+            pod(input: {podId: $podId}) {
+                id
+                name
+                desiredStatus
+                runtime {
+                    gpus {
+                        id
+                    }
+                    ports {
+                        ip
+                        isIpPublic
+                        privatePort
+                        publicPort
+                        type
+                    }
+                }
+                machine {
+                    podHostId
+                }
+                gpuCount
+                costPerHr
+            }
+        }
+        """
+        data = self._run_operation("getting pod details", query, {"podId": pod_id})
+        if data is None:
+            return None
+        return data.get("pod")

runpod_manager.py ADDED Viewed

	@@ -0,0 +1,483 @@

+"""
+RunPod Manager - High-level management for RunPod instances
+Provides higher-level functions for managing RunPod instances including
+deployment, monitoring, and SSH access.
+"""
+import paramiko
+import time
+from typing import Optional, Dict, List
+from dataclasses import dataclass, field
+from runpod_client import RunPodClient, PodInfo
+@dataclass
+class DeploymentConfig:
+    """Configuration for RunPod deployment."""
+    # NOTE(review): nothing in the visible part of this module reads this
+    # class; RunPodManager.deploy_training_pod takes individual arguments and
+    # hard-codes the same image and ports. Confirm the intended consumer.
+    # Pod display name.
+    name: str = "aura-training-pod"
+    # GPU type and number of GPUs to request.
+    gpu_type: str = "NVIDIA A100 80GB PCIe"
+    gpu_count: int = 1
+    # Storage size in GB.
+    storage_gb: int = 100
+    # Container image to run on the pod.
+    image: str = "runpod/pytorch:2.1.0-py3.10-cuda11.8.0-devel-ubuntu22.04"
+    # Comma-separated "<port>/<protocol>" entries exposed on the pod.
+    ports: str = "8888/http,22/tcp,7860/http"  # Jupyter, SSH, Gradio
+@dataclass
+class TrainingConfig:
+    """Configuration for model training on RunPod."""
+    # NOTE(review): nothing in the visible part of this module reads this
+    # class; RunPodManager.run_training_on_pod takes plain dicts with
+    # different fallback values (e.g. batch_size 1, r 16). Confirm the
+    # intended consumer before relying on these defaults.
+    # Model identifier of the base model to fine-tune.
+    model_name: str = "Qwen/Qwen2.5-7B-Instruct"
+    # LoRA adapter rank.
+    lora_rank: int = 8
+    # Optimizer settings.
+    learning_rate: float = 2e-4
+    num_epochs: int = 3
+    batch_size: int = 4
+    gradient_accumulation_steps: int = 4
+    # Whether to load the base model with 4-bit quantization.
+    use_4bit: bool = True
+    # Maximum sequence length — presumably in tokens; TODO confirm.
+    max_length: int = 2048
+class RunPodManager:
+    """Manager for RunPod instances with deployment and monitoring.
+    Wraps a RunPodClient for pod lifecycle calls and uses paramiko for SSH
+    command execution and SFTP transfers. Progress and errors are reported
+    with print(); most methods signal failure by returning None or False.
+    """
+    def __init__(self, api_key: Optional[str] = None):
+        """Create the underlying RunPodClient from ``api_key``."""
+        self.client = RunPodClient(api_key)
+    def deploy_training_pod(
+        self,
+        name: str,
+        gpu_type: str = "NVIDIA A100 80GB PCIe",
+        gpu_count: int = 1,
+        storage_gb: int = 100
+    ) -> Optional[str]:
+        """Deploy a pod configured for model training.
+        Args:
+            name: pod display name.
+            gpu_type: passed to the client as the GPU type id.
+            gpu_count: number of GPUs.
+            storage_gb: volume size in GB (container disk is fixed at 50 GB).
+        Returns:
+            The new pod id, or None if creation failed.
+        """
+        # Use PyTorch image with CUDA support
+        image = "runpod/pytorch:2.1.0-py3.10-cuda11.8.0-devel-ubuntu22.04"
+        print(f"Deploying training pod '{name}'...")
+        print(f"  GPU: {gpu_type} x{gpu_count}")
+        print(f"  Storage: {storage_gb}GB")
+        pod_id = self.client.create_pod(
+            name=name,
+            image_name=image,
+            gpu_type_id=gpu_type,
+            gpu_count=gpu_count,
+            volume_in_gb=storage_gb,
+            container_disk_in_gb=50,
+            ports="8888/http,22/tcp,7860/http"  # Jupyter, SSH, Gradio
+        )
+        if pod_id:
+            print(f"Pod created: {pod_id}")
+            print("Waiting for pod to start...")
+            # Fixed grace period only; this does not verify the pod is running.
+            time.sleep(10)
+        return pod_id
+    def get_pod_status(self, pod_id: str) -> Optional[Dict]:
+        """Return a summary dict for ``pod_id``, or None if it is not listed."""
+        for pod in self.client.list_pods():
+            if pod.id == pod_id:
+                return {
+                    "id": pod.id,
+                    "name": pod.name,
+                    "status": pod.status,
+                    "gpu_type": pod.gpu_type,
+                    "cost_per_hour": pod.cost_per_hour
+                }
+        return None
+    def list_all_pods(self) -> List[PodInfo]:
+        """List all pods"""
+        return self.client.list_pods()
+    def stop_pod(self, pod_id: str) -> bool:
+        """Stop a running pod"""
+        print(f"Stopping pod {pod_id}...")
+        return self.client.stop_pod(pod_id)
+    def terminate_pod(self, pod_id: str) -> bool:
+        """Terminate a pod"""
+        print(f"Terminating pod {pod_id}...")
+        return self.client.terminate_pod(pod_id)
+    def get_ssh_connection(
+        self,
+        pod_ip: str,
+        username: str = "root",
+        key_file: Optional[str] = None,
+        password: Optional[str] = None,
+        port: int = 22
+    ) -> Optional[paramiko.SSHClient]:
+        """Open an SSH connection to a pod.
+        Args:
+            pod_ip: host to connect to.
+            username: login user.
+            key_file: private key path; takes precedence over ``password``.
+            password: login password, used only when ``key_file`` is not given.
+            port: TCP port of the SSH server. Pods reached through a mapped
+                public port need that port here rather than 22.
+        Returns:
+            A connected client (the caller must close it), or None when no
+            credential was supplied or the connection failed.
+        """
+        if not key_file and not password:
+            print("Either key_file or password must be provided")
+            return None
+        ssh = paramiko.SSHClient()
+        # NOTE(review): AutoAddPolicy accepts unknown host keys without verification.
+        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        credentials = {"key_filename": key_file} if key_file else {"password": password}
+        try:
+            ssh.connect(
+                pod_ip,
+                port=port,
+                username=username,
+                timeout=10,
+                **credentials
+            )
+        except Exception as e:
+            print(f"SSH connection failed: {e}")
+            # Release the half-open client instead of leaking it.
+            ssh.close()
+            return None
+        return ssh
+    def execute_command(
+        self,
+        ssh: paramiko.SSHClient,
+        command: str
+    ) -> tuple[str, str]:
+        """Execute a command via SSH and return ``(stdout, stderr)`` as text.
+        Blocks until both streams reach EOF. The exit status is not checked.
+        """
+        _stdin, stdout, stderr = ssh.exec_command(command)
+        return stdout.read().decode(), stderr.read().decode()
+    def upload_file(
+        self,
+        ssh: paramiko.SSHClient,
+        local_path: str,
+        remote_path: str
+    ) -> bool:
+        """Upload a file to the pod over SFTP. Returns False (after printing) on any error."""
+        try:
+            sftp = ssh.open_sftp()
+            try:
+                sftp.put(local_path, remote_path)
+            finally:
+                # Close the SFTP session even when the transfer fails.
+                sftp.close()
+            return True
+        except Exception as e:
+            print(f"File upload failed: {e}")
+            return False
+    def download_file(
+        self,
+        ssh: paramiko.SSHClient,
+        remote_path: str,
+        local_path: str
+    ) -> bool:
+        """Download a file from the pod over SFTP. Returns False (after printing) on any error."""
+        try:
+            sftp = ssh.open_sftp()
+            try:
+                sftp.get(remote_path, local_path)
+            finally:
+                # Close the SFTP session even when the transfer fails.
+                sftp.close()
+            return True
+        except Exception as e:
+            print(f"File download failed: {e}")
+            return False
+    def setup_training_environment(
+        self,
+        ssh: paramiko.SSHClient,
+        requirements_file: Optional[str] = None
+    ) -> bool:
+        """Setup the training environment on a pod.
+        Upgrades pip, then optionally uploads ``requirements_file`` to
+        /tmp/requirements.txt and installs it.
+        Returns:
+            False if the upload fails or pip's stderr mentions "error"
+            (case-insensitive); True otherwise.
+        """
+        print("Setting up training environment...")
+        # Update pip
+        print("Updating pip...")
+        self.execute_command(ssh, "pip install --upgrade pip")
+        if requirements_file:
+            # Upload requirements file
+            print("Uploading requirements...")
+            if not self.upload_file(ssh, requirements_file, "/tmp/requirements.txt"):
+                return False
+            # Install requirements
+            print("Installing requirements...")
+            _stdout, stderr = self.execute_command(
+                ssh,
+                "pip install -r /tmp/requirements.txt"
+            )
+            # Heuristic: the exit status is not available from execute_command.
+            if stderr and "error" in stderr.lower():
+                print(f"Installation errors: {stderr}")
+                return False
+        print("Environment setup complete!")
+        return True
+    def monitor_training(
+        self,
+        ssh: paramiko.SSHClient,
+        log_file: str = "/workspace/training.log",
+        interval: int = 30
+    ):
+        """Monitor training progress by polling a remote log file.
+        Prints lines that appeared since the previous poll and loops until
+        interrupted with Ctrl+C.
+        Args:
+            ssh: connected client.
+            log_file: remote path of the log; passed to the shell quoted, so
+                it is taken literally (no glob or ~ expansion).
+            interval: seconds to sleep between polls.
+        """
+        import shlex
+        print(f"Monitoring training log: {log_file}")
+        print(f"Checking every {interval} seconds...")
+        print("Press Ctrl+C to stop monitoring\n")
+        command = f"cat {shlex.quote(log_file)} 2>/dev/null || echo 'Log file not found'"
+        last_line_count = 0
+        try:
+            while True:
+                stdout, _stderr = self.execute_command(ssh, command)
+                # splitlines() yields [] for an empty log, so an empty file
+                # does not advance the cursor and hide the first real line.
+                lines = stdout.splitlines()
+                if lines != ['Log file not found']:
+                    for line in lines[last_line_count:]:
+                        print(line)
+                    # Also moves the cursor back if the log was truncated.
+                    last_line_count = len(lines)
+                time.sleep(interval)
+        except KeyboardInterrupt:
+            print("\nStopped monitoring")
+    def get_available_gpus(self) -> List[Dict]:
+        """Get list of available GPU types"""
+        return self.client.get_gpu_types()
+    def estimate_cost(
+        self,
+        gpu_type: str,
+        gpu_count: int,
+        hours: float
+    ) -> Optional[float]:
+        """Estimate cost for a training job from an existing pod's hourly rate.
+        Returns ``cost_per_hour * hours`` of the first listed pod whose
+        ``gpu_type`` and ``gpu_count`` both match, or None when none matches.
+        """
+        # NOTE(review): PodInfo.gpu_type as produced by RunPodClient.list_pods
+        # is "GPU" or "GPU-<id prefix>", so a marketing name such as
+        # "NVIDIA A100 80GB PCIe" will not match here — confirm intended input.
+        for pod in self.client.list_pods():
+            if pod.gpu_type == gpu_type and pod.gpu_count == gpu_count:
+                return pod.cost_per_hour * hours
+        return None
+    @staticmethod
+    def _build_training_script(
+        model_name: str,
+        lora_config: Dict,
+        training_config: Dict
+    ) -> str:
+        """Render the Python source of the fine-tuning script run on the pod."""
+        # Values are embedded with !r so they arrive as Python literals and a
+        # quote inside a value cannot break or alter the generated source.
+        lora_r = lora_config.get('r', 16)
+        lora_alpha = lora_config.get('lora_alpha', 32)
+        num_epochs = training_config.get('num_epochs', 3)
+        batch_size = training_config.get('batch_size', 1)
+        grad_accum = training_config.get('gradient_accumulation_steps', 16)
+        learning_rate = training_config.get('learning_rate', 2e-4)
+        return f"""
+import json
+import sys
+from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer
+from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
+from datasets import Dataset
+import torch
+print("Loading training data...")
+with open('/workspace/training_data.json', 'r') as f:
+    data = json.load(f)
+print(f"Loaded {{len(data)}} training examples")
+model_name = {model_name!r}
+print(f"Loading model: {{model_name}}")
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    load_in_4bit=True,
+    device_map="auto",
+    torch_dtype=torch.float16
+)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer.pad_token = tokenizer.eos_token
+print("Preparing model for training...")
+model = prepare_model_for_kbit_training(model)
+lora_config = LoraConfig(
+    r={lora_r!r},
+    lora_alpha={lora_alpha!r},
+    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
+    lora_dropout=0.05,
+    bias="none",
+    task_type=TaskType.CAUSAL_LM
+)
+model = get_peft_model(model, lora_config)
+model.print_trainable_parameters()
+print("Preparing dataset...")
+def format_data(example):
+    text = f"###Instruction: {{example['instruction']}}\\n###Response: {{example['output']}}"
+    return tokenizer(text, truncation=True, max_length=2048, padding="max_length")
+dataset = Dataset.from_list(data)
+dataset = dataset.map(format_data, batched=False, remove_columns=dataset.column_names)
+training_args = TrainingArguments(
+    output_dir="/workspace/outputs",
+    num_train_epochs={num_epochs!r},
+    per_device_train_batch_size={batch_size!r},
+    gradient_accumulation_steps={grad_accum!r},
+    learning_rate={learning_rate!r},
+    logging_steps=10,
+    save_steps=100,
+    save_total_limit=2,
+    fp16=True,
+    report_to="none"
+)
+print("Starting training...")
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=dataset,
+    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+)
+trainer.train()
+print("Saving model...")
+model.save_pretrained("/workspace/final_model")
+tokenizer.save_pretrained("/workspace/final_model")
+print("Training complete!")
+"""
+    def run_training_on_pod(
+        self,
+        pod_id: str,
+        training_data: List[Dict],
+        model_name: str,
+        lora_config: Dict,
+        training_config: Dict
+    ) -> bool:
+        """Run training on RunPod pod instead of locally.
+        Uploads the training data and a generated script to /workspace on the
+        pod, installs the Python dependencies, and starts the script in the
+        background with its output in /workspace/training.log.
+        Args:
+            pod_id: pod to train on; it must expose private port 22.
+            training_data: JSON-serialisable records; the generated script
+                reads the 'instruction' and 'output' keys of each record.
+            model_name: model identifier passed to ``from_pretrained``.
+            lora_config: optional keys 'r' and 'lora_alpha'.
+            training_config: optional keys 'num_epochs', 'batch_size',
+                'gradient_accumulation_steps' and 'learning_rate'.
+        Returns:
+            True once the background job has been launched; False when pod
+            details, the SSH key, the connection, or an upload is unavailable.
+        """
+        import json
+        import os
+        import tempfile
+        print(f"Starting remote training on pod {pod_id}...")
+        # 1. Get pod details to find SSH info
+        pod_details = self.client.get_pod_details(pod_id)
+        if not pod_details:
+            print("Error: Could not get pod details")
+            return False
+        runtime = pod_details.get("runtime")
+        if not runtime or not runtime.get("ports"):
+            print("Error: Pod runtime not available. Pod may still be starting.")
+            return False
+        # Find the public endpoint that maps to the container's SSH port
+        ssh_ip = None
+        ssh_port = None
+        for mapping in runtime["ports"]:
+            if mapping.get("privatePort") == 22:
+                ssh_ip = mapping.get("ip")
+                ssh_port = mapping.get("publicPort")
+                break
+        if not ssh_ip or not ssh_port:
+            print("Error: SSH port not found in pod details")
+            return False
+        print(f"SSH Connection: {ssh_ip}:{ssh_port}")
+        training_script = self._build_training_script(model_name, lora_config, training_config)
+        temp_paths = []
+        try:
+            # 2. Save training data and script to temp files for upload
+            with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
+                temp_paths.append(f.name)
+                json.dump(training_data, f)
+                data_file = f.name
+            with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
+                temp_paths.append(f.name)
+                f.write(training_script)
+                script_file = f.name
+            print("Connecting to pod via SSH...")
+            # Get path to SSH key
+            key_path = os.path.join(os.getcwd(), ".ssh", "runpod_key")
+            if not os.path.exists(key_path):
+                print(f"Error: SSH key not found at {key_path}")
+                print("Run: ssh-keygen -t ed25519 -f .ssh/runpod_key -N ''")
+                print("Then add the public key to RunPod: https://www.runpod.io/console/user/settings")
+                return False
+            # Connect on the mapped public port, not the default 22
+            # (RunPod uses root user by default)
+            ssh = self.get_ssh_connection(
+                pod_ip=ssh_ip,
+                username="root",
+                password=None,
+                key_file=key_path,
+                port=ssh_port
+            )
+            if not ssh:
+                print("Error: Could not establish SSH connection")
+                print(f"Tried using key: {key_path}")
+                print("Verify the public key is added to RunPod: https://www.runpod.io/console/user/settings")
+                return False
+            try:
+                print("Uploading training data...")
+                if not self.upload_file(ssh, data_file, "/workspace/training_data.json"):
+                    return False
+                print("Uploading training script...")
+                if not self.upload_file(ssh, script_file, "/workspace/train.py"):
+                    return False
+                print("Installing required packages...")
+                self.execute_command(
+                    ssh,
+                    "pip install transformers peft datasets accelerate bitsandbytes"
+                )
+                print("Starting training on pod...")
+                print("Training will run in the background on the pod.")
+                print("You can monitor progress by checking the pod's logs.")
+                # Run training in background with nohup
+                self.execute_command(
+                    ssh,
+                    "nohup python /workspace/train.py > /workspace/training.log 2>&1 &"
+                )
+                print("\nTraining initiated successfully!")
+                print("Training data uploaded to: /workspace/training_data.json")
+                print("Training script uploaded to: /workspace/train.py")
+                print("Training log available at: /workspace/training.log")
+                print("\nTo monitor progress, you can:")
+                print(f"  1. SSH to pod: ssh root@{ssh_ip} -p {ssh_port}")
+                print("  2. View logs: tail -f /workspace/training.log")
+                return True
+            except Exception as e:
+                print(f"Error during remote training setup: {e}")
+                return False
+            finally:
+                ssh.close()
+        finally:
+            # Remove local temp files on every exit path; best effort, and a
+            # failure on one file must not skip the other.
+            for path in temp_paths:
+                try:
+                    os.unlink(path)
+                except OSError:
+                    pass