"""
Utility functions for handling Gemma models
"""
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from huggingface_hub import login
def get_available_models():
"""
Returns a list of available Gemma models for fine-tuning.
"""
return [
"google/gemma-2-2b-it",
"google/gemma-2-9b-it",
"google/gemma-2-27b-it"
]
def load_model(model_name, token=None):
"""
Loads a model from Hugging Face Hub.
Args:
model_name: Name of the model to load
token: Hugging Face token for access to gated models
Returns:
Tuple of (model, tokenizer)
"""
if token:
login(token)
# Set appropriate device
if torch.cuda.is_available():
device = "cuda"
elif torch.backends.mps.is_available():
device = "mps" # For Apple Silicon
else:
device = "cpu"
print(f"Loading model {model_name} on {device}...")
    # Load the model with parameters appropriate for the device and model size.
    # For names like "google/gemma-2-9b-it", split("-")[2] yields the size tag
    # ("2b", "9b" or "27b").
    model_size = model_name.split("-")[2]
    if device == "cuda":
        # For CUDA devices, choose the loading strategy based on model size
        # and memory requirements
        if model_size == "2b":
            # The 2B model fits comfortably in BF16 without quantization
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
device_map="auto"
)
        else:
            # The larger 9B/27B models are loaded in 8-bit (via bitsandbytes)
            # to reduce GPU memory usage
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                quantization_config=BitsAndBytesConfig(load_in_8bit=True)
            )
    elif device == "cpu":
        # For CPU, load the 2B model in full precision; larger models are loaded
        # in 8-bit to conserve memory (requires a bitsandbytes build with CPU
        # support and will be slow)
        if model_size == "2b":
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map={"": device}
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map={"": device},
                quantization_config=BitsAndBytesConfig(load_in_8bit=True)
            )
else: # MPS (Apple Silicon)
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.float16,
device_map={"": device}
)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
return model, tokenizer
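
# Illustrative usage of load_model (a sketch, shown as comments so nothing runs on
# import). It assumes a Hugging Face token with access to the gated Gemma weights
# is available in the HF_TOKEN environment variable; the prompt and generation
# settings are arbitrary examples:
#
#     model, tokenizer = load_model("google/gemma-2-2b-it", token=os.environ.get("HF_TOKEN"))
#     inputs = tokenizer("Explain LoRA in one sentence.", return_tensors="pt").to(model.device)
#     output_ids = model.generate(**inputs, max_new_tokens=64)
#     print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
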
def export_model(model_path, output_dir, model_name, format="pytorch", quantization=None):
"""
Exports a fine-tuned model to the specified format.
Args:
model_path: Path to the fine-tuned model
output_dir: Directory to save the exported model
model_name: Name for the exported model
format: Export format ("pytorch", "gguf", or "safetensors")
quantization: Quantization level for GGUF format
Returns:
Dictionary with export information
"""
if not os.path.exists(model_path):
raise ValueError(f"Model path '{model_path}' does not exist")
os.makedirs(output_dir, exist_ok=True)
export_path = os.path.join(output_dir, model_name)
os.makedirs(export_path, exist_ok=True)
    # Load the fine-tuned model and tokenizer. Note: if the checkpoint is a LoRA/PEFT
    # adapter rather than a merged model, merge it into the base model first (e.g. with
    # peft's merge_and_unload); this loader expects full model weights.
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Handle different export formats
if format.lower() == "pytorch":
# Export as PyTorch model
model.save_pretrained(export_path)
tokenizer.save_pretrained(export_path)
elif format.lower() == "safetensors":
# Export as safetensors
model.save_pretrained(export_path, safe_serialization=True)
tokenizer.save_pretrained(export_path)
elif format.lower() == "gguf":
# For GGUF, we'd typically use a conversion script
# This is simplified; in practice you'd use specific tools for GGUF conversion
if quantization is not None and quantization.lower() != "none":
# Command for quantized GGUF conversion would go here
# In practice, use llama.cpp or similar tools
pass
else:
# Command for standard GGUF conversion would go here
pass
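        # For reference, a typical llama.cpp-based conversion is a two-step flow
        # (a sketch; the "convert_hf_to_gguf.py" script and "llama-quantize" tool
        # ship with llama.cpp, but script names and flags vary between releases,
        # so verify them against your checkout):
        #   python convert_hf_to_gguf.py <model_path> --outfile model-f16.gguf
        #   ./llama-quantize model-f16.gguf model-quantized.gguf Q4_K_M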
else:
raise ValueError(f"Unsupported export format: {format}")
# Calculate model size
model_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
model_size_gb = model_size_bytes / (1024**3)
return {
"format": format.lower(),
"quantization": quantization if format.lower() == "gguf" else "None",
"model_name": model_name,
"export_path": export_path,
"model_size": f"{model_size_gb:.2f} GB"
}
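
# Illustrative call of export_model (a sketch; the paths and names below are
# placeholders, not artifacts produced by this module):
#
#     info = export_model(
#         model_path="outputs/gemma-2-2b-it-merged",
#         output_dir="exports",
#         model_name="gemma-2-2b-it-finetuned",
#         format="safetensors",
#     )
#     print(info["export_path"], info["model_size"])
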
def push_to_hub(model_path, repo_name, token):
"""
Pushes a fine-tuned model to Hugging Face Hub.
Args:
model_path: Path to the fine-tuned model
repo_name: Name for the repository on Hugging Face Hub
token: Hugging Face token
Returns:
URL of the uploaded model
"""
if not os.path.exists(model_path):
raise ValueError(f"Model path '{model_path}' does not exist")
login(token)
    # Load the fine-tuned model and tokenizer (LoRA/PEFT adapters should be merged
    # into the base model before pushing a standalone model repository)
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Push to hub
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)
    # Construct the URL of the uploaded repository
    model_url = f"https://huggingface.co/{repo_name}"
return model_url
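

if __name__ == "__main__":
    # Minimal smoke test (a sketch): list the supported Gemma variants. Loading,
    # exporting, or pushing a model is deliberately not attempted here because it
    # requires downloaded weights and a Hugging Face token with access to the
    # gated Gemma models (see load_model and push_to_hub above).
    for name in get_available_models():
        print(name)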