import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer
from threading import Thread

# ZeroGPU support for Hugging Face Spaces (no-op when not running in Spaces)
try:
    import spaces
    GPU_DECORATOR = spaces.GPU(duration=30)
    print("[SUCCESS] ZeroGPU support enabled")
except ImportError:
    # Fallback for local execution
    def GPU_DECORATOR(func):
        return func
    print("[WARNING] Running without ZeroGPU (local mode)")

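# In both branches GPU_DECORATOR works as a bare decorator: spaces.GPU(duration=30)
# returns a decorator, and the local fallback is an identity decorator.
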
# Language-specific system prompts (extracted from training set)
LANGUAGE_SYSTEM_PROMPTS = {
    # Germanic languages
    "en": "You are a helpful AI assistant.",
    "de": "Sie sind ein hilfreicher KI-Assistent.",
    "nl": "Je bent een behulpzame AI-assistent.",
    "da": "Du er en hjælpsom AI-assistent.",
    "sv": "Du är en hjälpsam AI-assistent.",
    "no": "Du er en hjelpsom AI-assistent.",
    
    # Romance languages
    "fr": "Vous êtes un assistant IA utile.",
    "es": "Eres un asistente de IA útil.",
    "it": "Sei un assistente AI utile.",
    "pt": "Você é um assistente de IA prestativo.",
    "ro": "Ești un asistent AI de ajutor.",
    
    # Slavic languages
    "pl": "Jesteś pomocnym asystentem AI.",
    "cs": "Jste užitečný AI asistent.",
    "sk": "Ste užitočný AI asistent.",
    "bg": "Вие сте полезен AI асистент.",
    "hr": "Vi ste korisni AI asistent.",
    "sl": "Vi ste koristen AI asistent.",
    
    # Baltic languages
    "lv": "Tu esi izpalīdzīgs mākslīgā intelekta asistents.",
    "lt": "Jūs esate naudingas dirbtinio intelekto asistentas.",
    "et": "Olete abivalmis tehisintellekti assistent.",
    
    # Other European languages
    "fi": "Olet avulias tekoälyavustaja.",
    "hu": "Ön egy segítőkész AI asszisztens.",
    "el": "Είστε ένας χρήσιμος βοηθός AI.",
    "mt": "Inti assistent tal-AI utli.",
    
    # Eastern European
    "ru": "Вы полезный ИИ-ассистент.",
    "uk": "Ви корисний AI-асистент.",
    
    # Asian languages (if needed)
    "zh": "你是一个有用的AI助手。",
    "ja": "あなたは役に立つAIアシスタントです。",
    "ko": "당신은 유용한 AI 어시스턴트입니다.",
    "hi": "आप एक उपयोगी एआई सहायक हैं।",
}

# Model loading
MODEL_PATH = "martinsu/tildeopen-30b-mu-instruct"

print(f"Loading model from {MODEL_PATH}...")

# Configure 4-bit quantization for memory efficiency
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,  # Nested quantization for extra memory savings
)
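# Rough arithmetic (not measured here): 30B parameters at 4 bits/param is about
# 30e9 * 0.5 bytes ≈ 15 GB of quantized weights, before KV cache and overhead.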

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=quantization_config,
    device_map="auto",
)

print("[SUCCESS] Model loaded!")

@GPU_DECORATOR
def chat(message, history, temperature, language):
    """Generate response with configurable temperature and language.
    
    This function is decorated with @spaces.GPU for dynamic GPU allocation
    in Hugging Face Spaces (30 second duration). Effect-free locally.
    """
    # Gradio ChatInterface passes history with content as string or list[dict]
    # Extract text content for apply_chat_template
    def extract_text(content):
        if isinstance(content, str):
            return content
        elif isinstance(content, list):
            # List of content items: [{"type": "text", "text": "..."}]
            texts = [
                item.get("text", "")
                for item in content
                if isinstance(item, dict) and item.get("type") == "text"
            ]
            return " ".join(texts)
        return ""
    
    # Get system prompt for selected language
    system_prompt = LANGUAGE_SYSTEM_PROMPTS.get(language, LANGUAGE_SYSTEM_PROMPTS["en"])
    
    # Build messages with system prompt and text-only content
    messages = [{"role": "system", "content": system_prompt}]
    messages.extend([{"role": msg["role"], "content": extract_text(msg["content"])} for msg in history])
    messages.append({"role": "user", "content": extract_text(message)})
    
    # Apply chat template (tokenizer handles the formatting)
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    # Generate with streaming
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=512,
        temperature=temperature,
        top_p=0.9,
        do_sample=True
    )
    
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    
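    # Yield the cumulative text; gr.ChatInterface re-renders the reply on each yield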
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial

# Prepare language options (sorted by priority)
language_options = [
    ("English", "en"),
    ("Latviešu (Latvian)", "lv"),
    ("Deutsch (German)", "de"),
    ("Français (French)", "fr"),
    ("Español (Spanish)", "es"),
    ("Italiano (Italian)", "it"),
    ("Polski (Polish)", "pl"),
    ("Português (Portuguese)", "pt"),
    ("Nederlands (Dutch)", "nl"),
    ("Svenska (Swedish)", "sv"),
    ("Česky (Czech)", "cs"),
    ("Română (Romanian)", "ro"),
    ("Dansk (Danish)", "da"),
    ("Suomi (Finnish)", "fi"),
    ("Magyar (Hungarian)", "hu"),
    ("Ελληνικά (Greek)", "el"),
    ("Български (Bulgarian)", "bg"),
    ("Lietuvių (Lithuanian)", "lt"),
    ("Eesti (Estonian)", "et"),
    ("Русский (Russian)", "ru"),
]
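# Dropdown values are keys into LANGUAGE_SYSTEM_PROMPTS; a code missing from the
# dict falls back to the English prompt (see chat()).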

# Chat interface with additional inputs always visible
demo = gr.ChatInterface(
    chat,
    title="TildeOpen-30B Chat",
    description="Multilingual chat model supporting European languages (4-bit quantized), please select appropriate system prompt language (Swedish, Latvian, Estonian), default is English, that will help model to predict more desirable tokens.",
    additional_inputs=[
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
        gr.Dropdown(choices=language_options, value="en", label="Language / System Prompt"),
    ],
    additional_inputs_accordion=gr.Accordion(label="⚙️ Settings", open=True),
    submit_btn=True,
    stop_btn=True,
    autofocus=True,
)

if __name__ == "__main__":
    demo.launch()
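
# To run locally: `python app.py`, then open the printed URL (by default
# http://127.0.0.1:7860). Assumes a CUDA GPU and the bitsandbytes package for
# the 4-bit load; CPU-only use would need a different loading config.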