# app/model.py
"""
CPU-optimized model loading with automatic GGUF download.
Uses llama.cpp for 2-4x faster inference on CPU.
"""
import gc
import os
from typing import Generator, Optional

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Global singleton
_llama_model: Optional[Llama] = None

# Model configuration - Mungert/Nanbeige4-3B-Thinking-2511-GGUF
MODEL_REPO = "Mungert/Nanbeige4-3B-Thinking-2511-GGUF"
MODEL_FILE = "Nanbeige4-3B-Thinking-2511-iq2_m.gguf"  # iq2_m = 2-bit, very fast, good quality
CACHE_DIR = "/tmp/models"


def download_gguf_model() -> str:
    """
    Download the GGUF model from Hugging Face if it does not exist locally.
    Returns the local path to the model file.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    local_path = os.path.join(CACHE_DIR, MODEL_FILE)

    # Skip the download if the file is already present
    if os.path.exists(local_path):
        print(f"GGUF model already exists: {local_path}")
        print(f"Size: {os.path.getsize(local_path) / (1024 * 1024):.1f} MB")
        return local_path

    print(f"Downloading GGUF model: {MODEL_FILE}")
    print(f"From: {MODEL_REPO}")
    print("This may take a few minutes...")

    try:
        # Download from Hugging Face
        downloaded_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            cache_dir=CACHE_DIR,
            local_dir=CACHE_DIR,
            local_dir_use_symlinks=False,
        )
        print(f"Model downloaded to: {downloaded_path}")
        print(f"Size: {os.path.getsize(downloaded_path) / (1024 * 1024):.1f} MB")
        return downloaded_path
    except Exception as e:
        print(f"Error downloading GGUF model: {e}")
        raise


def load_model() -> Llama:
    """
    Load the GGUF model with llama.cpp (optimized for CPU).
    Downloads the file automatically if it is not present.
    """
    global _llama_model

    if _llama_model is not None:
        return _llama_model

    # Download if needed
    model_path = download_gguf_model()

    print("Loading GGUF model with llama.cpp (CPU optimized)...")
    print("Using iq2_m quantization (2-bit, very fast)")

    # CPU optimizations for HF Spaces (2 vCPU, limited RAM)
    _llama_model = Llama(
        model_path=model_path,
        n_ctx=2048,       # Context window
        n_threads=2,      # HF Spaces free tier has 2 vCPUs
        n_batch=256,      # Smaller batch for memory efficiency
        verbose=False,    # Quiet mode
        use_mmap=True,    # Memory-map the file for faster loading
        use_mlock=False,  # Don't lock memory (HF Spaces constraint)
    )

    print("Model loaded successfully!")
    print("Threads: 2 | Context: 2048 | Quantization: iq2_m (2-bit)")
    gc.collect()
    return _llama_model


def generate_stream(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 200,
) -> Generator[str, None, None]:
    """
    Streaming generation with llama.cpp (fast on CPU).
    Yields text chunks as they are produced.
    """
    model = load_model()

    # llama.cpp native streaming - very fast on CPU
    stream = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stream=True,
        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"],
    )

    for output in stream:
        text = output["choices"][0]["text"]
        if text:
            yield text

    gc.collect()
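

# The stop list above includes "<|im_end|>", which suggests a ChatML-style
# chat template. The helper below is an illustrative sketch under that
# assumption only -- `build_chat_prompt` is not part of the original module,
# and the exact template should be verified against the model card.
def build_chat_prompt(
    user_message: str,
    system_message: str = "You are a helpful coding assistant.",
) -> str:
    """Assemble a ChatML-style prompt (assumed format; check the model card)."""
    return (
        f"<|im_start|>system\n{system_message}<|im_end|>\n"
        f"<|im_start|>user\n{user_message}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )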


def generate(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 200,
) -> str:
    """
    Non-streaming generation with llama.cpp.
    """
    model = load_model()

    output = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"],
    )
    gc.collect()
    return output["choices"][0]["text"]
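

# Minimal smoke test, assuming the module is run directly inside the Space
# container (e.g. `python -m app.model`). The raw prompt string is purely
# illustrative; real callers should format prompts with the model's chat
# template (see the build_chat_prompt sketch above).
if __name__ == "__main__":
    demo_prompt = "Write a Python function that reverses a string."

    print("--- non-streaming ---")
    print(generate(demo_prompt, max_tokens=64))

    print("--- streaming ---")
    for chunk in generate_stream(demo_prompt, max_tokens=64):
        print(chunk, end="", flush=True)
    print()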