File size: 3,855 Bytes
05944f9
538c943
05944f9
 
538c943
 
 
 
 
 
9c2f3d5
05944f9
538c943
47309bf
05944f9
538c943
9c2f3d5
 
 
05944f9
 
 
 
538c943
05944f9
 
538c943
05944f9
 
 
 
 
 
9c2f3d5
05944f9
 
 
 
 
538c943
05944f9
 
 
 
 
 
 
 
 
 
9c2f3d5
05944f9
 
 
 
 
538c943
 
05944f9
538c943
05944f9
 
538c943
05944f9
47309bf
05944f9
 
47309bf
05944f9
 
47309bf
05944f9
9c2f3d5
538c943
9c2f3d5
05944f9
 
 
9c2f3d5
 
05944f9
 
 
 
 
 
9c2f3d5
538c943
 
05944f9
538c943
 
05944f9
 
 
 
 
538c943
05944f9
47309bf
05944f9
47309bf
05944f9
 
 
 
 
 
 
9c2f3d5
05944f9
 
 
 
 
 
47309bf
 
 
 
05944f9
 
 
 
 
538c943
05944f9
47309bf
05944f9
538c943
05944f9
 
 
 
 
9c2f3d5
05944f9
538c943
05944f9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# app/model.py
"""
CPU-optimized model loading with automatic GGUF download.
Uses llama.cpp for 2-4x faster inference on CPU.
"""

import gc
import os
from typing import Generator, Optional

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Global singleton
_llama_model: Optional[Llama] = None

# Model configuration - Mungert/Nanbeige4-3B-Thinking-2511-GGUF
MODEL_REPO = "Mungert/Nanbeige4-3B-Thinking-2511-GGUF"
MODEL_FILE = "Nanbeige4-3B-Thinking-2511-iq2_m.gguf"  # iq2_m = 2-bit, very fast, good quality
CACHE_DIR = "/tmp/models"


def download_gguf_model() -> str:
    """
    Download the GGUF model file from Hugging Face if it is not already cached.

    Returns:
        str: Local filesystem path to the GGUF model file.

    Raises:
        Exception: Re-raises any error from ``hf_hub_download`` after printing it.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    local_path = os.path.join(CACHE_DIR, MODEL_FILE)
    
    # Skip the network round-trip if the file is already on disk.
    if os.path.exists(local_path):
        print(f"GGUF model already exists: {local_path}")
        print(f"Size: {os.path.getsize(local_path) / (1024*1024):.1f} MB")
        return local_path
    
    print(f"Downloading GGUF model: {MODEL_FILE}")
    print(f"From: {MODEL_REPO}")
    print("This may take a few minutes...")
    
    try:
        # NOTE(review): `local_dir_use_symlinks=False` was removed here — it is
        # deprecated and ignored by recent huggingface_hub releases; passing
        # `local_dir` alone already produces a real file copy at that location.
        downloaded_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            cache_dir=CACHE_DIR,
            local_dir=CACHE_DIR,
        )
        print(f"Model downloaded to: {downloaded_path}")
        print(f"Size: {os.path.getsize(downloaded_path) / (1024*1024):.1f} MB")
        return downloaded_path
        
    except Exception as e:
        # Surface the failure for the Space logs, then propagate to the caller.
        print(f"Error downloading GGUF model: {e}")
        raise


def load_model() -> Llama:
    """
    Load the GGUF model with llama.cpp (CPU-optimized), downloading it first
    if necessary. A module-level singleton ensures the model is loaded once.

    Returns:
        Llama: The cached llama.cpp model instance.
    """
    global _llama_model
    
    # Return the cached singleton if a previous call already loaded it.
    if _llama_model is not None:
        return _llama_model
    
    # Fetch the model file from Hugging Face on first use.
    model_path = download_gguf_model()
    
    print("Loading GGUF model with llama.cpp (CPU optimized)...")
    print("Using iq2_m quantization (2-bit, very fast)")
    
    # Settings tuned for HF Spaces free tier (2 vCPU, limited RAM).
    _llama_model = Llama(
        model_path=model_path,
        n_ctx=2048,        # context window (tokens)
        n_threads=2,       # HF Spaces free tier exposes 2 vCPUs
        n_batch=256,       # smaller batch keeps peak memory low
        verbose=False,     # suppress llama.cpp logging
        use_mmap=True,     # mmap weights for faster startup / lower RSS
        use_mlock=False,   # mlock not usable under HF Spaces constraints
    )
    
    # Fix: these were f-strings with no placeholders (ruff F541).
    print("Model loaded successfully!")
    print("Threads: 2 | Context: 2048 | Quantization: iq2_m (2-bit)")
    
    # Drop any garbage produced during the (large) load before serving.
    gc.collect()
    return _llama_model


def generate_stream(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 200
) -> Generator[str, None, None]:
    """
    Stream generated text chunks from llama.cpp for *prompt*.

    Args:
        prompt: Raw prompt text passed straight to the model.
        temperature: Sampling temperature.
        max_tokens: Upper bound on generated tokens.

    Yields:
        str: Non-empty text fragments as the model produces them.
    """
    model = load_model()
    
    # llama.cpp native streaming - very fast on CPU
    stream = model(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stream=True,
        stop=["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"]
    )
    
    try:
        for output in stream:
            text = output["choices"][0]["text"]
            if text:
                yield text
    finally:
        # Fix: run cleanup even when the consumer abandons the generator
        # early — the original only collected after full exhaustion.
        gc.collect()


def generate(
    prompt: str,
    temperature: float = 0.7,
    max_tokens: int = 200
) -> str:
    """
    Run a single non-streaming completion with llama.cpp.

    Args:
        prompt: Raw prompt text passed straight to the model.
        temperature: Sampling temperature.
        max_tokens: Upper bound on generated tokens.

    Returns:
        str: The generated completion text.
    """
    llm = load_model()
    
    # Sequences that terminate generation (chat-turn markers + EOS).
    stop_sequences = ["</s>", "User:", "Human:", "Assistant:", "<|im_end|>"]
    
    result = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.95,
        stop=stop_sequences,
    )
    completion = result["choices"][0]["text"]
    
    # Reclaim temporaries from the inference pass before returning.
    gc.collect()
    return completion