import spaces
import torch
import os
from diffusers import DiffusionPipeline

MODEL_ID = 'black-forest-labs/FLUX.1-dev'

# Set custom cache directory to avoid filling Hugging Face storage limit
CUSTOM_CACHE_DIR = './flux_cache'
os.environ['HF_HOME'] = CUSTOM_CACHE_DIR
os.environ['TRANSFORMERS_CACHE'] = CUSTOM_CACHE_DIR


# Compile the model ahead-of-time for optimal performance (CPU version)
@spaces.GPU(duration=1500)  # Note: This might not work on CPU-only, but keeping for compatibility
def compile_transformer():
    # Load model with HF token if available and custom cache dir
    token = os.getenv('HF_TOKEN')
    pipe = DiffusionPipeline.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,  # Use float32 for CPU
        token=token,
        cache_dir=CUSTOM_CACHE_DIR
    )
    # Note: No .to('cuda') since CUDA is not available
    # Skip AoT compilation for CPU - it's not supported well
    return None


# Load the model (CPU version)
def load_model():
    # Load model with HF token if available and custom cache dir
    token = os.getenv('HF_TOKEN')
    pipe = DiffusionPipeline.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,  # Use float32 for CPU compatibility
        token=token,
        cache_dir=CUSTOM_CACHE_DIR
    )
    # Note: No .to('cuda') - running on CPU
    # Skip AoT compilation for CPU
    # compiled_transformer = compile_transformer()
    # if compiled_transformer:
    #     spaces.aoti_apply(compiled_transformer, pipe.transformer)
    return pipe


# Note: Removed @spaces.GPU since CUDA is not available
def generate_image(pipe, prompt):
    # Generate image with optimized settings for CPU
    image = pipe(
        prompt,
        num_inference_steps=10,  # Even fewer steps for CPU speed
        guidance_scale=3.5,
        height=256,  # Smaller size for CPU
        width=256
    ).images[0]
    return image
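

# Illustrative usage sketch (an assumption, not part of the original app): ties
# load_model() and generate_image() together for a quick local smoke test.
# The prompt text and output filename below are hypothetical examples; the
# pipeline returns PIL images, so .save() writes the result to disk.
if __name__ == '__main__':
    pipe = load_model()
    demo_prompt = 'A watercolor painting of a lighthouse at dawn'  # hypothetical prompt
    image = generate_image(pipe, demo_prompt)
    image.save('output.png')  # hypothetical output path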