"""
Utility functions for handling Gemma models
"""
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from huggingface_hub import login
def get_available_models():
"""
Returns a list of available Gemma models for fine-tuning.
"""
return [
"google/gemma-2-2b-it",
"google/gemma-2-9b-it",
"google/gemma-2-27b-it"
]
def load_model(model_name, token=None):
"""
Loads a model from Hugging Face Hub.
Args:
model_name: Name of the model to load
token: Hugging Face token for access to gated models
Returns:
Tuple of (model, tokenizer)
"""
if token:
login(token)
# Set appropriate device
if torch.cuda.is_available():
device = "cuda"
elif torch.backends.mps.is_available():
device = "mps" # For Apple Silicon
else:
device = "cpu"
print(f"Loading model {model_name} on {device}...")
    # Load the model with parameters appropriate for the device and model size.
    # For names like "google/gemma-2-9b-it", split("-")[2] yields the size tag
    # ("2b", "9b" or "27b").
    model_size = model_name.split("-")[2]
    if device == "cuda":
        # For CUDA devices, choose the loading strategy based on model size
        # and memory requirements
        if model_size == "2b":
            # The 2B model fits comfortably in BF16 without quantization
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
device_map="auto"
)
        else:
            # The larger 9B/27B models are loaded in 8-bit (via bitsandbytes)
            # to reduce GPU memory usage
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                quantization_config=BitsAndBytesConfig(load_in_8bit=True)
            )
    elif device == "cpu":
        # For CPU, load the 2B model in full precision; larger models are loaded
        # in 8-bit to conserve memory (requires a bitsandbytes build with CPU
        # support and will be slow)
        if model_size == "2b":
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map={"": device}
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map={"": device},
                quantization_config=BitsAndBytesConfig(load_in_8bit=True)
            )
else: # MPS (Apple Silicon)
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.float16,
device_map={"": device}
)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
return model, tokenizer
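
# Illustrative usage of load_model (a sketch, shown as comments so nothing runs on
# import). It assumes a Hugging Face token with access to the gated Gemma weights
# is available in the HF_TOKEN environment variable; the prompt and generation
# settings are arbitrary examples:
#
#     model, tokenizer = load_model("google/gemma-2-2b-it", token=os.environ.get("HF_TOKEN"))
#     inputs = tokenizer("Explain LoRA in one sentence.", return_tensors="pt").to(model.device)
#     output_ids = model.generate(**inputs, max_new_tokens=64)
#     print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
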
def export_model(model_path, output_dir, model_name, format="pytorch", quantization=None):
"""
Exports a fine-tuned model to the specified format.
Args:
model_path: Path to the fine-tuned model
output_dir: Directory to save the exported model
model_name: Name for the exported model
format: Export format ("pytorch", "gguf", or "safetensors")
quantization: Quantization level for GGUF format
Returns:
Dictionary with export information
"""
if not os.path.exists(model_path):
raise ValueError(f"Model path '{model_path}' does not exist")
os.makedirs(output_dir, exist_ok=True)
export_path = os.path.join(output_dir, model_name)
os.makedirs(export_path, exist_ok=True)
    # Load the fine-tuned model and tokenizer. Note: if the checkpoint is a LoRA/PEFT
    # adapter rather than a merged model, merge it into the base model first (e.g. with
    # peft's merge_and_unload); this loader expects full model weights.
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Handle different export formats
if format.lower() == "pytorch":
# Export as PyTorch model
model.save_pretrained(export_path)
tokenizer.save_pretrained(export_path)
elif format.lower() == "safetensors":
# Export as safetensors
model.save_pretrained(export_path, safe_serialization=True)
tokenizer.save_pretrained(export_path)
elif format.lower() == "gguf":
# For GGUF, we'd typically use a conversion script
# This is simplified; in practice you'd use specific tools for GGUF conversion
if quantization is not None and quantization.lower() != "none":
# Command for quantized GGUF conversion would go here
# In practice, use llama.cpp or similar tools
pass
else:
# Command for standard GGUF conversion would go here
pass
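        # For reference, a typical llama.cpp-based conversion is a two-step flow
        # (a sketch; the "convert_hf_to_gguf.py" script and "llama-quantize" tool
        # ship with llama.cpp, but script names and flags vary between releases,
        # so verify them against your checkout):
        #   python convert_hf_to_gguf.py <model_path> --outfile model-f16.gguf
        #   ./llama-quantize model-f16.gguf model-quantized.gguf Q4_K_M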
else:
raise ValueError(f"Unsupported export format: {format}")
# Calculate model size
model_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
model_size_gb = model_size_bytes / (1024**3)
return {
"format": format.lower(),
"quantization": quantization if format.lower() == "gguf" else "None",
"model_name": model_name,
"export_path": export_path,
"model_size": f"{model_size_gb:.2f} GB"
}
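
# Illustrative call of export_model (a sketch; the paths and names below are
# placeholders, not artifacts produced by this module):
#
#     info = export_model(
#         model_path="outputs/gemma-2-2b-it-merged",
#         output_dir="exports",
#         model_name="gemma-2-2b-it-finetuned",
#         format="safetensors",
#     )
#     print(info["export_path"], info["model_size"])
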
def push_to_hub(model_path, repo_name, token):
"""
Pushes a fine-tuned model to Hugging Face Hub.
Args:
model_path: Path to the fine-tuned model
repo_name: Name for the repository on Hugging Face Hub
token: Hugging Face token
Returns:
URL of the uploaded model
"""
if not os.path.exists(model_path):
raise ValueError(f"Model path '{model_path}' does not exist")
login(token)
    # Load the fine-tuned model and tokenizer (LoRA/PEFT adapters should be merged
    # into the base model before pushing a standalone model repository)
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Push to hub
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)
    # Construct the URL of the uploaded repository
    model_url = f"https://huggingface.co/{repo_name}"
return model_url
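

if __name__ == "__main__":
    # Minimal smoke test (a sketch): list the supported Gemma variants. Loading,
    # exporting, or pushing a model is deliberately not attempted here because it
    # requires downloaded weights and a Hugging Face token with access to the
    # gated Gemma models (see load_model and push_to_hub above).
    for name in get_available_models():
        print(name)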