import logging
import threading
from typing import Any, Dict, List

import spaces  # Hugging Face ZeroGPU decorator used on load_model()
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

logger = logging.getLogger(__name__)


class CodeModel:
    """Coding model wrapper with optimized inference (StarCoder2-7B, the closest fit to the 5B target)."""

    def __init__(self):
        self.model_name = "bigcode/starcoder2-7b"  # 7B model (closest to 5B with excellent coding)
        self.parameter_count = "7B"
        self.max_length = 16384
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self.is_loaded = False
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self._lock = threading.Lock()

    @spaces.GPU(duration=1200)  # Extended duration for model loading
    def load_model(self):
        """Load the model (called via the spaces decorator for GPU allocation)."""
        try:
            logger.info(f"Loading {self.model_name} model...")

            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=True,
                padding_side="left",
            )

            # Set pad token if not present
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            # Load model with optimization
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto" if self.device == "cuda" else None,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                use_cache=True,
            )

            # Set model to evaluation mode
            self.model.eval()

            # Create pipeline for easier inference.
            # device_map="auto" already places the model, so only pass an explicit
            # device for the CPU case; otherwise the pipeline raises an error.
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device=None if self.device == "cuda" else -1,
                do_sample=True,
                temperature=0.7,
                top_p=0.95,
                repetition_penalty=1.1,
                max_new_tokens=2048,
                pad_token_id=self.tokenizer.eos_token_id,
            )

            self.is_loaded = True
            logger.info(f"✅ {self.model_name} loaded successfully on {self.device}")

        except Exception as e:
            logger.error(f"❌ Error loading model: {e}")
            self._fallback_model()

    def _fallback_model(self):
        """Fall back to a smaller model if the main model fails to load."""
        try:
            logger.info("Trying fallback model: microsoft/DialoGPT-medium")
            self.model_name = "microsoft/DialoGPT-medium"
            self.parameter_count = "345M"
            self.max_length = 1024

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto" if self.device == "cuda" else None,
            )

            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device=None if self.device == "cuda" else -1,
                max_new_tokens=512,
                pad_token_id=self.tokenizer.eos_token_id,
            )

            self.is_loaded = True
            logger.info("✅ Fallback model loaded successfully")

        except Exception as e:
            logger.error(f"❌ Fallback model also failed: {e}")
            self.is_loaded = False

    def generate(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_new_tokens: int = 2048,
        language: str = "python",
    ) -> str:
        """Generate a response from the model."""
        if not self.is_loaded:
            return "I'm sorry, the model is not loaded yet. Please try again in a moment."

        try:
            with self._lock:  # Ensure thread-safe access to the shared model
                # Convert chat-format messages to a plain-text conversation prompt
                conversation = ""
                if messages:
                    for msg in messages:
                        role = msg["role"]
                        content = msg["content"]
                        if role == "system":
                            conversation += f"System: {content}\n\n"
                        elif role == "user":
                            conversation += f"Human: {content}\n"
                        elif role == "assistant":
                            conversation += f"Assistant: {content}\n"

                    # Add specific coding instructions
                    if "write" in conversation.lower() or "code" in conversation.lower():
                        conversation += (
                            f"\n\nPlease provide clean, well-commented {language} code "
                            "with proper syntax and best practices."
                        )

                conversation += "\nAssistant:"

                # Generate response via the pipeline when it is available
                with torch.no_grad():
                    if self.pipeline:
                        outputs = self.pipeline(
                            conversation,
                            do_sample=True,
                            temperature=temperature,
                            top_p=0.95,
                            repetition_penalty=1.1,
                            max_new_tokens=max_new_tokens,
                            pad_token_id=self.tokenizer.eos_token_id,
                            eos_token_id=self.tokenizer.eos_token_id,
                            return_full_text=False,
                            clean_up_tokenization_spaces=True,
                        )
                        if outputs and len(outputs) > 0:
                            response = outputs[0]["generated_text"].strip()
                            return response

                # Fallback to direct model generation
                inputs = self.tokenizer.encode(conversation, return_tensors="pt").to(self.device)

                with torch.no_grad():
                    outputs = self.model.generate(
                        inputs,
                        do_sample=True,
                        temperature=temperature,
                        top_p=0.95,
                        repetition_penalty=1.1,
                        max_new_tokens=max_new_tokens,
                        pad_token_id=self.tokenizer.eos_token_id,
                        eos_token_id=self.tokenizer.eos_token_id,
                        attention_mask=torch.ones_like(inputs),
                    )

                # Decode only the newly generated tokens
                response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
                return response.strip()

        except Exception as e:
            logger.error(f"Generation error: {e}")
            return f"I apologize, but I encountered an error while generating the response: {str(e)}"

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded model."""
        return {
            "model_name": self.model_name,
            "parameter_count": self.parameter_count,
            "max_length": self.max_length,
            "device": self.device,
            "is_loaded": self.is_loaded,
            "vocab_size": len(self.tokenizer) if self.tokenizer else 0,
        }


# Global model instance for the server
_global_model = None


def get_model() -> CodeModel:
    """Get or create the global model instance."""
    global _global_model
    if _global_model is None:
        _global_model = CodeModel()
        # Load model asynchronously in a background thread
        threading.Thread(target=_global_model.load_model, daemon=True).start()
    return _global_model


def create_code_model() -> CodeModel:
    """Factory function for creating CodeModel instances."""
    return CodeModel()
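

# Example usage: a minimal sketch, not part of the server wiring. It assumes the
# module is run directly, that enough GPU/CPU memory is available for the chosen
# model, and that polling `is_loaded` is an acceptable way to wait for the
# background load (or the fallback) to finish.
if __name__ == "__main__":
    import time

    code_model = get_model()  # starts load_model() in a daemon thread

    for _ in range(60):  # wait up to ~5 minutes for the background load
        if code_model.is_loaded:
            break
        time.sleep(5)

    reply = code_model.generate(
        messages=[
            {"role": "system", "content": "You are a helpful coding assistant."},
            {"role": "user", "content": "Write a Python function that reverses a string."},
        ],
        temperature=0.2,
        max_new_tokens=256,
        language="python",
    )
    print(reply)
    print(code_model.get_model_info())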