# syntax=docker/dockerfile:1
# =============================================================================
# AI-Toolkit Trainer - RunPod Serverless Worker (CACHE OPTIMIZED)
# =============================================================================
# CACHE OPTIMIZATION: Layers ordered from LEAST to MOST frequently changed
#
# Layer Order (top = rarely changes, bottom = frequently changes):
#   1. Base image + system deps     [RARELY CHANGE]
#   2. PyTorch + CUDA               [RARELY CHANGE]
#   3. AI-Toolkit requirements      [OCCASIONALLY CHANGE]
#   4. AI-Toolkit code              [OCCASIONALLY CHANGE]
#   5. RunPod + HF deps             [RARELY CHANGE]
#   6. Directory setup              [RARELY CHANGE]
#   7. rp_handler.py                [FREQUENTLY CHANGE]
#
# Build:
#   docker buildx build --platform linux/amd64 -f Dockerfile.runpod \
#     -t aloukikaditya/trainer:latest --push .
#
# Build with cache:
#   DOCKER_BUILDKIT=1 docker build -f Dockerfile.runpod -t aio-trainer .
# =============================================================================

ARG BASE_IMAGE=runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04
FROM ${BASE_IMAGE}

# -----------------------------------------------------------------------------
# [LAYER 1] Environment Configuration - RARELY CHANGES
# -----------------------------------------------------------------------------
# NOTE: DEBIAN_FRONTEND is intentionally NOT set here — it is a build-time-only
# concern and baking it into ENV would leak into the runtime container. It is
# supplied as a stage-scoped ARG below instead.
# HF caches point at the RunPod network volume so model downloads persist
# across serverless worker cold starts.
ENV PYTHONUNBUFFERED=1 \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    HF_HOME=/runpod-volume/huggingface-cache \
    HUGGINGFACE_HUB_CACHE=/runpod-volume/huggingface-cache/hub \
    TRANSFORMERS_CACHE=/runpod-volume/huggingface-cache/hub \
    NO_ALBUMENTATIONS_UPDATE=1 \
    DISABLE_TELEMETRY=YES \
    TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.9 9.0"

WORKDIR /app

# -----------------------------------------------------------------------------
# [LAYER 2] System Dependencies - RARELY CHANGES
# -----------------------------------------------------------------------------
# Build-time only; does not persist into the runtime environment.
ARG DEBIAN_FRONTEND=noninteractive

# Cache-mount recipe: the default ubuntu "docker-clean" hook deletes downloaded
# .debs after install, which defeats the /var/cache/apt cache mount — disable
# it and tell apt to keep packages. Do NOT `apt-get clean` / `rm -rf
# /var/lib/apt/lists/*` here: both paths live inside cache mounts, so cleaning
# them wipes the shared build cache without shrinking the image at all.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    rm -f /etc/apt/apt.conf.d/docker-clean && \
    echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' \
      > /etc/apt/apt.conf.d/keep-cache && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
      aria2 \
      curl \
      ffmpeg \
      git \
      git-lfs \
      libgl1-mesa-glx \
      libglib2.0-0 \
      wget

# -----------------------------------------------------------------------------
# [LAYER 3] PyTorch (use base image PyTorch or install specific version)
# -----------------------------------------------------------------------------
# Base image already ships PyTorch; fail the build early if it is missing or
# built without CUDA support.
RUN python -c "import torch; print(f'PyTorch {torch.__version__}, CUDA {torch.version.cuda}')"

# -----------------------------------------------------------------------------
# [LAYER 4] AI-Toolkit Requirements - OCCASIONALLY CHANGES
# -----------------------------------------------------------------------------
# Copy only requirements first so source-code edits don't invalidate the
# (expensive) dependency layer.
COPY ai-toolkit/requirements.txt /app/ai-toolkit/requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --upgrade pip && \
    pip install -r /app/ai-toolkit/requirements.txt

# -----------------------------------------------------------------------------
# [LAYER 5] RunPod + HuggingFace Dependencies - RARELY CHANGES
# -----------------------------------------------------------------------------
# NOTE(review): these are unpinned, so rebuilds are not reproducible — pin
# exact versions (e.g. runpod==x.y.z) once the known-good set is confirmed.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install runpod hf_transfer huggingface_hub

# -----------------------------------------------------------------------------
# [LAYER 6] AI-Toolkit Code - OCCASIONALLY CHANGES
# -----------------------------------------------------------------------------
COPY ai-toolkit/ /app/ai-toolkit/

# Smoke-check that the copied tree is importable from the path the handler
# presumably adds at runtime (sys.path.insert of /app/ai-toolkit).
RUN python -c "import sys; sys.path.insert(0, '/app/ai-toolkit'); print('AI-Toolkit ready')"

# -----------------------------------------------------------------------------
# [LAYER 7] Directory Setup - RARELY CHANGES
# -----------------------------------------------------------------------------
# /runpod-volume is replaced by the network volume mount at runtime; creating
# it here only covers local runs without a volume attached.
RUN mkdir -p \
      /workspace/dataset \
      /workspace/output \
      /runpod-volume/huggingface-cache/hub

# -----------------------------------------------------------------------------
# [LAYER 8] Handler Code - FREQUENTLY CHANGES
# -----------------------------------------------------------------------------
# This layer is last so changes to the handler don't invalidate the
# ai-toolkit / dependency cache above.
COPY rp_handler.py /app/rp_handler.py

# Verify handler imports work (catches syntax errors and missing deps at
# build time instead of at worker cold start).
RUN python -c "from rp_handler import handler, MODEL_PRESETS; print(f'Handler ready: {list(MODEL_PRESETS.keys())}')"

# -----------------------------------------------------------------------------
# Runtime Configuration
# -----------------------------------------------------------------------------
EXPOSE 8000

CMD ["python", "-u", "rp_handler.py"]