Spaces:

multimodalart
/

ai-toolkit

Running on CPU Upgrade

App Files Files Community

ai-toolkit / ui /src /app /api /hf-jobs /route.ts

apolinario

Attempt to use the API

e306cd2 3 months ago

raw

history blame contribute delete

38.9 kB

	import { NextRequest, NextResponse } from 'next/server';
	import { spawn } from 'child_process';
	import { writeFile, readFile, unlink } from 'fs/promises';
	import path from 'path';
	import { tmpdir } from 'os';

	export async function POST(request: NextRequest) {
	try {
	const body = await request.json();
	const { action, token, hardware, namespace, jobConfig, datasetRepo, participateHackathon } = body;

	switch (action) {
	case 'checkCapacity':
	try {
	if (!token) {
	return NextResponse.json({ error: 'Token required' }, { status: 400 });
	}

	const capacityStatus = await checkHFJobsCapacity(token);
	return NextResponse.json(capacityStatus);
	} catch (error: any) {
	console.error('Capacity check error:', error);
	return NextResponse.json({ error: error.message }, { status: 500 });
	}

	case 'checkStatus':
	try {
	if (!token \|\| !jobConfig?.hf_job_id) {
	return NextResponse.json({ error: 'Token and job ID required' }, { status: 400 });
	}

	const jobNamespaceOverride = jobConfig?.hf_job_namespace;
	const jobStatus = await checkHFJobStatus(token, jobConfig.hf_job_id, jobNamespaceOverride);
	return NextResponse.json({ status: jobStatus });
	} catch (error: any) {
	console.error('Job status check error:', error);
	return NextResponse.json({ error: error.message }, { status: 500 });
	}

	case 'generateScript':
	try {
	const uvScript = generateUVScript({
	jobConfig,
	datasetRepo,
	namespace,
	token: token \|\| 'YOUR_HF_TOKEN',
	});

	return NextResponse.json({
	script: uvScript,
	filename: `train_${jobConfig.config.name.replace(/[^a-zA-Z0-9]/g, '_')}.py`
	});
	} catch (error: any) {
	return NextResponse.json({ error: error.message }, { status: 500 });
	}

	case 'submitJob':
	try {
	if (!token \|\| !hardware) {
	return NextResponse.json({ error: 'Token and hardware required' }, { status: 400 });
	}

	// Generate UV script
	const uvScript = generateUVScript({
	jobConfig,
	datasetRepo,
	namespace,
	token,
	});

	// Write script to temporary file
	const scriptPath = path.join(tmpdir(), `train_${Date.now()}.py`);
	await writeFile(scriptPath, uvScript);

	// Submit HF job using uv run
	const namespaceOverride = participateHackathon ? 'lora-training-frenzi' : undefined;
	const jobId = await submitHFJobUV(
	token,
	hardware,
	scriptPath,
	namespaceOverride
	);

	const jobNamespace = namespaceOverride ?? namespace;

	return NextResponse.json({
	success: true,
	jobId,
	jobNamespace,
	message: `Job submitted successfully with ID: ${jobId}`
	});
	} catch (error: any) {
	console.error('Job submission error:', error);
	return NextResponse.json({ error: error.message }, { status: 500 });
	}

	default:
	return NextResponse.json({ error: 'Invalid action' }, { status: 400 });
	}
	} catch (error: any) {
	console.error('HF Jobs API error:', error);
	return NextResponse.json({ error: error.message }, { status: 500 });
	}
	}

	function generateUVScript({ jobConfig, datasetRepo, namespace, token }: {
	jobConfig: any;
	datasetRepo: string;
	namespace: string;
	token: string;
	}) {
	const config = jobConfig.config;
	const process = config.process[0];

	return `# /// script
	# dependencies = [
	# "torch>=2.0.0",
	# "torchvision",
	# "torchaudio",
	# "torchao==0.10.0",
	# "safetensors",
	# "diffusers @ git+https://github.com/huggingface/diffusers",
	# "transformers==4.52.4",
	# "lycoris-lora==1.8.3",
	# "flatten_json",
	# "pyyaml",
	# "oyaml",
	# "tensorboard",
	# "kornia",
	# "invisible-watermark",
	# "einops",
	# "accelerate",
	# "toml",
	# "albumentations==1.4.15",
	# "albucore==0.0.16",
	# "pydantic",
	# "omegaconf",
	# "k-diffusion",
	# "open_clip_torch",
	# "timm",
	# "prodigyopt",
	# "controlnet_aux==0.0.10",
	# "python-dotenv",
	# "bitsandbytes",
	# "hf_transfer",
	# "lpips",
	# "pytorch_fid",
	# "optimum-quanto==0.2.4",
	# "sentencepiece",
	# "huggingface_hub",
	# "peft",
	# "python-slugify",
	# "opencv-python-headless",
	# "pytorch-wavelets==1.3.0",
	# "matplotlib==3.10.1",
	# "setuptools==69.5.1",
	# "datasets==4.0.0",
	# "pyarrow==20.0.0",
	# "pillow",
	# "ftfy",
	# ]
	# ///

	import os
	import sys
	import subprocess
	import argparse
	import re
	import oyaml as yaml
	from datasets import load_dataset
	from huggingface_hub import HfApi, create_repo, upload_folder, snapshot_download
	import tempfile
	import shutil
	import glob
	from PIL import Image

	def setup_ai_toolkit():
	"""Clone and setup ai-toolkit repository"""
	repo_dir = "ai-toolkit"
	if not os.path.exists(repo_dir):
	print("Cloning ai-toolkit repository...")
	subprocess.run(
	["git", "clone", "https://github.com/ostris/ai-toolkit.git", repo_dir],
	check=True
	)
	sys.path.insert(0, os.path.abspath(repo_dir))
	return repo_dir

	def find_local_dataset_source(dataset_repo: str):
	if not dataset_repo:
	return None

	repo_stripped = dataset_repo.strip()
	candidates = []

	if os.path.isabs(repo_stripped):
	candidates.append(repo_stripped)
	else:
	candidates.append(repo_stripped)
	candidates.append(os.path.abspath(repo_stripped))

	normalized = normalize_repo_id(repo_stripped)
	if normalized:
	candidates.append(os.path.join("/datasets", normalized))

	if repo_stripped.startswith("/datasets/") and repo_stripped not in candidates:
	candidates.append(repo_stripped)

	seen = set()
	for candidate in candidates:
	if not candidate or candidate in seen:
	continue
	seen.add(candidate)
	if os.path.exists(candidate):
	return candidate

	return None


	def normalize_repo_id(dataset_repo: str) -> str:
	repo_id = dataset_repo.strip()
	if repo_id.startswith("/datasets/"):
	repo_id = repo_id[len("/datasets/"):]
	elif repo_id.startswith("datasets/"):
	repo_id = repo_id[len("datasets/"):]
	return repo_id.strip("/")


	def copy_dataset_files(source_dir: str, local_path: str):
	print(f"Collecting data files from {source_dir}")

	image_exts = {'.jpg', '.jpeg', '.png', '.webp', '.bmp'}
	video_exts = {'.mp4', '.avi', '.mov', '.webm', '.mkv', '.wmv', '.m4v', '.flv'}
	copied_images = 0
	copied_videos = 0
	copied_captions = 0

	for root, _, files in os.walk(source_dir):
	for file_name in files:
	ext = os.path.splitext(file_name)[1].lower()
	src_path = os.path.join(root, file_name)
	rel_path = os.path.relpath(src_path, source_dir)
	dest_path = os.path.join(local_path, rel_path)

	dest_dir = os.path.dirname(dest_path)
	if dest_dir and not os.path.exists(dest_dir):
	os.makedirs(dest_dir, exist_ok=True)

	if ext in image_exts:
	try:
	shutil.copy2(src_path, dest_path)
	copied_images += 1
	except Exception as img_error:
	print(f"Error copying image {src_path}: {img_error}")
	elif ext in video_exts:
	try:
	shutil.copy2(src_path, dest_path)
	copied_videos += 1
	except Exception as vid_error:
	print(f"Error copying video {src_path}: {vid_error}")
	elif ext == '.txt':
	try:
	shutil.copy2(src_path, dest_path)
	copied_captions += 1
	except Exception as txt_error:
	print(f"Error copying text file {src_path}: {txt_error}")
	else:
	try:
	shutil.copy2(src_path, dest_path)
	except Exception as other_error:
	print(f"Error copying file {src_path}: {other_error}")

	total_media = copied_images + copied_videos
	print(
	f"Prepared {copied_images} images, {copied_videos} videos, and {copied_captions} captions in {local_path}"
	)
	return total_media, copied_captions


	def download_dataset(dataset_repo: str, local_path: str):
	"""Download dataset from HF Hub as files"""
	print(f"Downloading dataset from {dataset_repo}...")

	os.makedirs(local_path, exist_ok=True)

	local_source = find_local_dataset_source(dataset_repo)
	if local_source:
	print(f"Found local dataset at {local_source}")
	media_copied, _ = copy_dataset_files(local_source, local_path)
	if media_copied > 0:
	return
	print("Local dataset did not contain media files, falling back to remote download")

	repo_id = normalize_repo_id(dataset_repo)

	if repo_id:
	try:
	print(f"Attempting snapshot download for dataset {repo_id}")
	temp_repo_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
	print(f"Downloaded repo to: {temp_repo_path}")
	print(f"Contents: {os.listdir(temp_repo_path)}")
	media_copied, _ = copy_dataset_files(temp_repo_path, local_path)
	if media_copied > 0:
	return
	print("Snapshot download did not contain media files, attempting structured dataset load")
	except Exception as snapshot_error:
	print(f"Snapshot download failed: {snapshot_error}")

	if not repo_id:
	raise ValueError("Dataset repository ID is required when no local dataset is available")

	try:
	dataset = load_dataset(repo_id, split="train")

	images_saved = 0
	captions_saved = 0

	for i, item in enumerate(dataset):
	if "image" in item and item["image"] is not None:
	image_path = os.path.join(local_path, f"image_{i:06d}.jpg")
	image = item["image"]

	if image.mode == 'RGBA':
	background = Image.new('RGB', image.size, (255, 255, 255))
	background.paste(image, mask=image.split()[-1])
	image = background
	elif image.mode not in ['RGB', 'L']:
	image = image.convert('RGB')

	image.save(image_path, 'JPEG')
	images_saved += 1

	if "text" in item and item["text"] is not None:
	caption_path = os.path.join(local_path, f"image_{i:06d}.txt")
	with open(caption_path, "w", encoding="utf-8") as f:
	f.write(item["text"])
	captions_saved += 1

	if images_saved == 0:
	raise ValueError(f"Structured dataset load completed but produced 0 images for {repo_id}")

	print(f"Downloaded {images_saved} items to {local_path}")

	except Exception as e:
	print(f"Failed to load as structured dataset: {e}")
	raise

	def create_config(dataset_path: str, output_path: str):
	"""Create training configuration"""
	import json

	# Load config from JSON string and fix boolean/null values for Python
	config_str = """${JSON.stringify(jobConfig, null, 2)}"""
	config_str = config_str.replace('true', 'True').replace('false', 'False').replace('null', 'None')
	config = eval(config_str)

	def resolve_manifest_value(value):
	if value is None:
	return None
	if isinstance(value, list):
	resolved_list = [resolve_manifest_value(v) for v in value]
	return [v for v in resolved_list if v is not None]

	if not isinstance(value, str) or value.strip() == "":
	return None

	normalized = value.replace("\\\\", "/")
	parts = [part for part in normalized.split("/") if part not in ("", ".")]
	return os.path.normpath(os.path.join(dataset_path, *parts))

	manifest_path = os.path.join(dataset_path, "manifest.json")
	manifest_data = None
	if os.path.isfile(manifest_path):
	try:
	with open(manifest_path, "r", encoding="utf-8") as manifest_file:
	manifest_data = json.load(manifest_file)
	except Exception as manifest_error:
	print(f"Failed to load dataset manifest: {manifest_error}")
	manifest_data = None

	process_config = config["config"]["process"][0]

	datasets_config = process_config.get("datasets", [])
	if manifest_data and isinstance(manifest_data, dict) and "datasets" in manifest_data:
	manifest_datasets = manifest_data.get("datasets", [])
	for idx, dataset_cfg in enumerate(datasets_config):
	manifest_entry = manifest_datasets[idx] if idx < len(manifest_datasets) else {}
	if isinstance(manifest_entry, dict):
	for key, value in manifest_entry.items():
	resolved_value = resolve_manifest_value(value)
	if resolved_value is not None and resolved_value != []:
	dataset_cfg[key] = resolved_value
	if key == "folder_path":
	dataset_cfg["dataset_path"] = resolved_value

	if "folder_path" not in dataset_cfg or not dataset_cfg["folder_path"]:
	dataset_cfg["folder_path"] = dataset_path
	dataset_cfg["dataset_path"] = dataset_path
	else:
	for dataset_cfg in datasets_config:
	dataset_cfg["folder_path"] = dataset_path
	dataset_cfg["dataset_path"] = dataset_path

	samples_config = process_config.get("sample", {}).get("samples", [])
	if manifest_data and isinstance(manifest_data, dict):
	manifest_samples = manifest_data.get("samples", [])
	for sample_entry in manifest_samples:
	if not isinstance(sample_entry, dict):
	continue
	index = sample_entry.get("index")
	ctrl_img_rel = sample_entry.get("ctrl_img")
	if (
	isinstance(index, int)
	and 0 <= index < len(samples_config)
	and ctrl_img_rel is not None
	):
	resolved_ctrl_img = resolve_manifest_value(ctrl_img_rel)
	if resolved_ctrl_img:
	samples_config[index]["ctrl_img"] = resolved_ctrl_img

	# Update training folder for cloud environment
	process_config["training_folder"] = output_path

	# Remove sqlite_db_path as it's not needed for cloud training
	if "sqlite_db_path" in process_config:
	del process_config["sqlite_db_path"]

	# Also change trainer type from ui_trainer to standard trainer to avoid UI dependencies
	if process_config["type"] == "ui_trainer":
	process_config["type"] = "sd_trainer"

	return config

	def upload_results(output_path: str, model_name: str, namespace: str, token: str, config: dict):
	"""Upload trained model to HF Hub with README generation and proper file organization"""
	import tempfile
	import shutil
	import glob
	from datetime import datetime
	from huggingface_hub import create_repo, upload_file, HfApi
	from collections import deque

	try:
	repo_id = f"{namespace}/{model_name}"

	# Create repository
	create_repo(repo_id=repo_id, token=token, exist_ok=True)

	print(f"Uploading model to {repo_id}...")

	# Create temporary directory for organized upload
	with tempfile.TemporaryDirectory() as temp_upload_dir:
	api = HfApi()

	# 1. Find and upload model files to root directory
	safetensors_files = glob.glob(os.path.join(output_path, "*", ".safetensors"), recursive=True)
	json_files = glob.glob(os.path.join(output_path, "*", ".json"), recursive=True)
	txt_files = glob.glob(os.path.join(output_path, "*", ".txt"), recursive=True)

	uploaded_files = []

	# Upload .safetensors files to root
	for file_path in safetensors_files:
	filename = os.path.basename(file_path)
	print(f"Uploading {filename} to repository root...")
	api.upload_file(
	path_or_fileobj=file_path,
	path_in_repo=filename,
	repo_id=repo_id,
	token=token
	)
	uploaded_files.append(filename)

	# Upload relevant JSON config files to root (skip metadata.json and other internal files)
	config_files_uploaded = []
	for file_path in json_files:
	filename = os.path.basename(file_path)
	# Only upload important config files, skip internal metadata
	if any(keyword in filename.lower() for keyword in ['config', 'adapter', 'lora', 'model']):
	print(f"Uploading {filename} to repository root...")
	api.upload_file(
	path_or_fileobj=file_path,
	path_in_repo=filename,
	repo_id=repo_id,
	token=token
	)
	uploaded_files.append(filename)
	config_files_uploaded.append(filename)

	def prepare_sample_metadata(samples_directory: str, sample_conf: dict):
	if not samples_directory or not os.path.isdir(samples_directory):
	return [], []

	allowed_ext = {'.jpg', '.jpeg', '.png', '.webp'}
	image_records = []
	for root, _, files in os.walk(samples_directory):
	for filename in files:
	ext = os.path.splitext(filename)[1].lower()
	if ext not in allowed_ext:
	continue
	abs_path = os.path.join(root, filename)
	try:
	mtime = os.path.getmtime(abs_path)
	except Exception:
	mtime = 0
	image_records.append((abs_path, mtime))

	if not image_records:
	return [], []

	image_records.sort(key=lambda item: (-item[1], item[0]))
	image_queue = deque(image_records)

	samples_list = sample_conf.get("samples", []) if sample_conf else []
	if not samples_list:
	legacy = sample_conf.get("prompts", []) if sample_conf else []
	samples_list = [{"prompt": prompt} for prompt in legacy if prompt]

	curated_samples = []
	for sample in samples_list:
	prompt = None
	if isinstance(sample, dict):
	prompt = sample.get("prompt")
	elif isinstance(sample, str):
	prompt = sample

	if not prompt:
	continue

	if not image_queue:
	break

	image_path, _ = image_queue.popleft()
	repo_rel_path = f"images/{os.path.basename(image_path)}"
	curated_samples.append({
	"prompt": prompt,
	"local_path": image_path,
	"repo_path": repo_rel_path,
	})

	all_files = [record[0] for record in image_records]
	return curated_samples, all_files

	samples_dir = os.path.join(output_path, "samples")
	sample_config = config.get("config", {}).get("process", [{}])[0].get("sample", {})
	curated_samples, sample_files = prepare_sample_metadata(samples_dir, sample_config)

	samples_uploaded = []
	if sample_files:
	print("Uploading sample images...")
	for file_path in sample_files:
	if not os.path.isfile(file_path):
	continue
	filename = os.path.basename(file_path)
	repo_path = f"images/{filename}"
	api.upload_file(
	path_or_fileobj=file_path,
	path_in_repo=repo_path,
	repo_id=repo_id,
	token=token
	)
	samples_uploaded.append(repo_path)

	# 3. Generate and upload README.md
	readme_content = generate_model_card_readme(
	repo_id=repo_id,
	config=config,
	model_name=model_name,
	curated_samples=curated_samples,
	uploaded_files=uploaded_files
	)

	# Create README.md file and upload to root
	readme_path = os.path.join(temp_upload_dir, "README.md")
	with open(readme_path, "w", encoding="utf-8") as f:
	f.write(readme_content)

	print("Uploading README.md to repository root...")
	api.upload_file(
	path_or_fileobj=readme_path,
	path_in_repo="README.md",
	repo_id=repo_id,
	token=token
	)

	print(f"Model uploaded successfully to https://huggingface.co/{repo_id}")
	print(f"Files uploaded: {len(uploaded_files)} model files, {len(samples_uploaded)} samples, README.md")

	except Exception as e:
	print(f"Failed to upload model: {e}")
	raise e

	def generate_model_card_readme(repo_id: str, config: dict, model_name: str, curated_samples: list = None, uploaded_files: list = None) -> str:
	"""Generate README.md content for the model card based on AI Toolkit's implementation"""
	import yaml
	import os

	try:
	# Extract configuration details
	process_config = config.get("config", {}).get("process", [{}])[0]
	model_config = process_config.get("model", {})
	train_config = process_config.get("train", {})
	sample_config = process_config.get("sample", {})

	# Gather model info
	base_model = model_config.get("name_or_path", "unknown")
	trigger_word = process_config.get("trigger_word")
	arch = model_config.get("arch", "")

	# Determine license based on base model
	if "FLUX.1-schnell" in base_model:
	license_info = {"license": "apache-2.0"}
	elif "FLUX.1-dev" in base_model:
	license_info = {
	"license": "other",
	"license_name": "flux-1-dev-non-commercial-license",
	"license_link": "https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/LICENSE.md"
	}
	else:
	license_info = {"license": "creativeml-openrail-m"}

	# Generate tags based on model architecture group
	tags = []
	lower_arch = (arch or "").lower()
	lower_model_name = (model_config.get("name_or_path", "") or "").lower()
	base_model_lower = (base_model or "").lower()

	# Define model groups based on the frontend options.ts structure
	# Group: 'image' -> text-to-image
	# Group: 'instruction' -> image-to-image
	# Group: 'video' -> check for i2v in arch name for image-to-video vs text-to-video

	image_arches = {
	'flux', 'flex1', 'flex2', 'chroma', 'lumina2',
	'qwen_image', 'hidream', 'sdxl', 'sd15', 'omnigen2'
	}
	instruction_arches = {
	'flux_kontext', 'qwen_image_edit', 'qwen_image_edit_plus', 'hidream_e1'
	}
	video_arches = {
	'wan21:1b', 'wan21_i2v:14b480p', 'wan21_i2v:14b', 'wan21:14b',
	'wan22_14b:t2v', 'wan22_14b_i2v', 'wan22_5b'
	}

	# Determine the task type based on architecture group
	if lower_arch in instruction_arches:
	tags.append("image-to-image")
	elif lower_arch in video_arches:
	# Video models: check if i2v is in the architecture name
	is_i2v = 'i2v' in lower_arch
	tags.append("image-to-video" if is_i2v else "text-to-video")
	elif lower_arch in image_arches:
	tags.append("text-to-image")
	else:
	# Fallback to text-to-image for unknown architectures
	tags.append("text-to-image")

	if "xl" in lower_arch:
	tags.append("stable-diffusion-xl")
	if "flux" in lower_arch:
	tags.append("flux")
	if "lumina" in lower_arch:
	tags.append("lumina2")
	if "sd3" in lower_arch or "v3" in lower_arch:
	tags.append("sd3")

	# Add LoRA-specific tags
	tags.extend(["lora", "diffusers", "template:sd-lora", "ai-toolkit"])

	# Generate widgets and gallery section from sample images
	curated_samples = curated_samples or []

	widgets = []
	prompt_bullets = []
	for sample in curated_samples:
	prompt_text = str(sample.get("prompt", "")).strip()
	repo_path = sample.get("repo_path")
	if not prompt_text or not repo_path:
	continue
	widgets.append({
	"text": prompt_text,
	"output": {"url": repo_path}
	})
	prompt_bullets.append(f"- {prompt_text}")

	gallery_section = ""
	if prompt_bullets:
	gallery_section = "<Gallery />\\n\\n" + "### Prompts\\n\\n" + "\\n".join(prompt_bullets) + "\\n\\n"

	# Determine torch dtype based on model
	dtype = "torch.bfloat16"
	try:
	arch_lower = arch.lower()
	except AttributeError:
	arch_lower = ""
	if "sd15" in arch_lower or "sdxl" in arch_lower:
	dtype = "torch.float16"

	# Find the main safetensors file for usage example
	main_safetensors = f"{model_name}.safetensors"
	if uploaded_files:
	safetensors_files = [f for f in uploaded_files if f.endswith('.safetensors')]
	if safetensors_files:
	preferred_name = f"{model_name}.safetensors"
	exact_match = next(
	(
	f
	for f in safetensors_files
	if os.path.basename(f) == preferred_name or f == preferred_name
	),
	None,
	)

	if exact_match:
	main_safetensors = exact_match
	else:
	def extract_step(filename: str) -> int:
	match = re.search(r"_(\d+)\.safetensors$", os.path.basename(filename))
	return int(match.group(1)) if match else -1

	safetensors_files.sort(
	key=lambda f: (extract_step(f), f),
	reverse=True,
	)
	main_safetensors = safetensors_files[0]

	# Construct YAML frontmatter
	frontmatter = {
	"tags": tags,
	"base_model": base_model,
	**license_info
	}

	if widgets:
	frontmatter["widget"] = widgets

	inference_params = {}
	sample_width = sample_config.get("width") if isinstance(sample_config, dict) else None
	sample_height = sample_config.get("height") if isinstance(sample_config, dict) else None
	if sample_width:
	inference_params["width"] = sample_width
	if sample_height:
	inference_params["height"] = sample_height
	if inference_params:
	frontmatter["inference"] = {"parameters": inference_params}

	if trigger_word:
	frontmatter["instance_prompt"] = trigger_word

	# Get first prompt for usage example
	usage_prompt = trigger_word or "a beautiful landscape"
	if widgets:
	usage_prompt = widgets[0]["text"]
	elif trigger_word:
	usage_prompt = trigger_word

	# Construct README content
	trigger_section = f"You should use \`{trigger_word}\` to trigger the image generation." if trigger_word else "No trigger words defined."

	# Build YAML frontmatter string
	frontmatter_yaml = yaml.dump(frontmatter, default_flow_style=False, allow_unicode=True, sort_keys=False).strip()

	readme_content = f"""---
	{frontmatter_yaml}
	---

	# {model_name}

	Model trained with [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit)

	{gallery_section}

	## Trigger words

	{trigger_section}

	## Download model and use it with ComfyUI, AUTOMATIC1111, SD.Next, Invoke AI, etc.

	Weights for this model are available in Safetensors format.

	[Download]({repo_id}/tree/main) them in the Files & versions tab.

	## Use it with the [🧨 diffusers library](https://github.com/huggingface/diffusers)

	\`\`\`py
	from diffusers import AutoPipelineForText2Image
	import torch

	pipeline = AutoPipelineForText2Image.from_pretrained('{base_model}', torch_dtype={dtype}).to('cuda')
	pipeline.load_lora_weights('{repo_id}', weight_name='{main_safetensors}')
	image = pipeline('{usage_prompt}').images[0]
	image.save("my_image.png")
	\`\`\`

	For more details, including weighting, merging and fusing LoRAs, check the [documentation on loading LoRAs in diffusers](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading_adapters)

	"""
	return readme_content

	except Exception as e:
	print(f"Error generating README: {e}")
	# Fallback simple README
	return f"""# {model_name}

	Model trained with [AI Toolkit by Ostris](https://github.com/ostris/ai-toolkit)

	## Download model

	Weights for this model are available in Safetensors format.

	[Download]({repo_id}/tree/main) them in the Files & versions tab.
	"""

	def main():
	# Setup environment - token comes from HF Jobs secrets
	if "HF_TOKEN" not in os.environ:
	raise ValueError("HF_TOKEN environment variable not set")

	# Install system dependencies for headless operation
	print("Installing system dependencies...")
	try:
	subprocess.run(["apt-get", "update"], check=True, capture_output=True)
	subprocess.run([
	"apt-get", "install", "-y",
	"libgl1-mesa-glx",
	"libglib2.0-0",
	"libsm6",
	"libxext6",
	"libxrender-dev",
	"libgomp1",
	"ffmpeg"
	], check=True, capture_output=True)
	print("System dependencies installed successfully")
	except subprocess.CalledProcessError as e:
	print(f"Failed to install system dependencies: {e}")
	print("Continuing without system dependencies...")

	# Setup ai-toolkit
	toolkit_dir = setup_ai_toolkit()

	# Create temporary directories
	with tempfile.TemporaryDirectory() as temp_dir:
	dataset_path = os.path.join(temp_dir, "dataset")
	output_path = os.path.join(temp_dir, "output")

	# Download dataset
	download_dataset("${datasetRepo}", dataset_path)

	# Create config
	config = create_config(dataset_path, output_path)
	config_path = os.path.join(temp_dir, "config.yaml")

	with open(config_path, "w") as f:
	yaml.dump(config, f, default_flow_style=False)

	# Run training
	print("Starting training...")
	os.chdir(toolkit_dir)

	subprocess.run([
	sys.executable, "run.py",
	config_path
	], check=True)

	print("Training completed!")

	# Upload results
	model_name = f"${jobConfig.config.name}-lora"
	upload_results(output_path, model_name, "${namespace}", os.environ["HF_TOKEN"], config)

	if __name__ == "__main__":
	main()
	`;
	}

	async function submitHFJobUV(token: string, hardware: string, scriptPath: string, namespaceOverride?: string): Promise<string> {
	return new Promise((resolve, reject) => {
	// Ensure token is available
	if (!token) {
	reject(new Error('HF_TOKEN is required'));
	return;
	}

	console.log('Setting up environment with HF_TOKEN for job submission');
	const namespaceArgs = namespaceOverride ? ` --namespace ${namespaceOverride}` : '';
	console.log(`Command: hf jobs uv run --flavor ${hardware} --timeout 5h --secrets HF_TOKEN --detach${namespaceArgs} ${scriptPath}`);

	// Use hf jobs uv run command with timeout and detach to get job ID
	const args = [
	'jobs', 'uv', 'run',
	'--flavor', hardware,
	'--timeout', '5h',
	'--secrets', 'HF_TOKEN',
	'--detach'
	];

	if (namespaceOverride) {
	args.push('--namespace', namespaceOverride);
	}

	args.push(scriptPath);

	const childProcess = spawn('hf', args, {
	env: {
	...process.env,
	HF_TOKEN: token
	}
	});

	let output = '';
	let error = '';

	childProcess.stdout.on('data', (data) => {
	const text = data.toString();
	output += text;
	console.log('HF Jobs stdout:', text);
	});

	childProcess.stderr.on('data', (data) => {
	const text = data.toString();
	error += text;
	console.log('HF Jobs stderr:', text);
	});

	childProcess.on('close', (code) => {
	console.log('HF Jobs process closed with code:', code);
	console.log('Full output:', output);
	console.log('Full error:', error);

	if (code === 0) {
	// With --detach flag, the output should be just the job ID
	const fullText = (output + ' ' + error).trim();

	// Updated patterns to handle variable-length hex job IDs (16-24+ characters)
	const jobIdPatterns = [
	/Job started with ID:\s*([a-f0-9]{16,})/i, // "Job started with ID: 68b26b73767540db9fc726ac"
	/job\s+([a-f0-9]{16,})/i, // "job 68b26b73767540db9fc726ac"
	/Job ID:\s*([a-f0-9]{16,})/i, // "Job ID: 68b26b73767540db9fc726ac"
	/created\s+job\s+([a-f0-9]{16,})/i, // "created job 68b26b73767540db9fc726ac"
	/submitted.*?job\s+([a-f0-9]{16,})/i, // "submitted ... job 68b26b73767540db9fc726ac"
	/https:\/\/huggingface\.co\/jobs\/[^\/]+\/([a-f0-9]{16,})/i, // URL pattern
	/([a-f0-9]{20,})/i, // Fallback: any 20+ char hex string
	];

	let jobId = 'unknown';

	for (const pattern of jobIdPatterns) {
	const match = fullText.match(pattern);
	if (match && match[1] && match[1] !== 'started') {
	jobId = match[1];
	console.log(`Extracted job ID using pattern: ${pattern.toString()} -> ${jobId}`);
	break;
	}
	}

	resolve(jobId);
	} else {
	reject(new Error(error \|\| output \|\| 'Failed to submit job'));
	}
	});

	childProcess.on('error', (err) => {
	console.error('HF Jobs process error:', err);
	reject(new Error(`Process error: ${err.message}`));
	});
	});
	}

	async function checkHFJobStatus(token: string, jobId: string, jobNamespace?: string): Promise<any> {
	return new Promise((resolve, reject) => {
	console.log(`Checking HF Job status for: ${jobId}`);
	const args = ['jobs', 'inspect'];

	if (jobNamespace) {
	console.log(`Using namespace override for status check: ${jobNamespace}`);
	args.push('--namespace', jobNamespace);
	}

	args.push(jobId);

	const childProcess = spawn('hf', args, {
	env: {
	...process.env,
	HF_TOKEN: token
	}
	});

	let output = '';
	let error = '';

	childProcess.stdout.on('data', (data) => {
	const text = data.toString();
	output += text;
	});

	childProcess.stderr.on('data', (data) => {
	const text = data.toString();
	error += text;
	});

	childProcess.on('close', (code) => {
	if (code === 0) {
	try {
	// Parse the JSON output from hf jobs inspect
	const jobInfo = JSON.parse(output);
	if (Array.isArray(jobInfo) && jobInfo.length > 0) {
	const job = jobInfo[0];
	resolve({
	id: job.id,
	status: job.status?.stage \|\| 'UNKNOWN',
	message: job.status?.message,
	created_at: job.created_at,
	flavor: job.flavor,
	url: job.url,
	});
	} else {
	reject(new Error('Invalid job info response'));
	}
	} catch (parseError: any) {
	console.error('Failed to parse job status:', parseError, output);
	reject(new Error('Failed to parse job status'));
	}
	} else {
	reject(new Error(error \|\| output \|\| 'Failed to check job status'));
	}
	});

	childProcess.on('error', (err) => {
	console.error('HF Jobs inspect process error:', err);
	reject(new Error(`Process error: ${err.message}`));
	});
	});
	}

	async function checkHFJobsCapacity(token: string): Promise<any> {
	try {
	console.log('Checking HF Jobs capacity for namespace: lora-training-frenzi via API');

	// Use HuggingFace API directly instead of CLI to avoid TTY issues
	const response = await fetch('https://huggingface.co/api/jobs/lora-training-frenzi', {
	headers: {
	'Authorization': `Bearer ${token}`,
	},
	});

	if (!response.ok) {
	throw new Error(`API request failed: ${response.status} ${response.statusText}`);
	}

	const jobs = await response.json();
	console.log(`Fetched ${jobs.length} total jobs from API`);

	// Count jobs with status RUNNING
	let runningCount = 0;
	for (const job of jobs) {
	const status = job.status?.stage \|\| job.status;
	if (status === 'RUNNING') {
	runningCount++;
	}
	}

	const atCapacity = runningCount >= 32;

	console.log(`\n=== FINAL COUNT ===`);
	console.log(`Found ${runningCount} RUNNING jobs. At capacity: ${atCapacity}`);
	console.log(`==================\n`);

	return {
	runningJobs: runningCount,
	atCapacity,
	capacityLimit: 32,
	};
	} catch (error: any) {
	console.error('Failed to check capacity via API:', error);
	throw new Error(`Failed to check capacity: ${error.message}`);
	}
	}