#!/usr/bin/env python3
"""
Phi-3.5-MoE Expert Assistant
Robust application with CPU/GPU environment detection and dependency handling
"""

import os
import sys
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Apply the model patch if available
try:
    import model_patch
    print("βœ… Applied model patch for handling missing dependencies")
except ImportError:
    print("ℹ️ Model patch not found, continuing without it")

# Environment detection
ON_GPU = torch.cuda.is_available()
MODEL_ID = os.getenv("HF_MODEL_ID", "microsoft/Phi-3.5-MoE-instruct")
REVISION = os.getenv("HF_REVISION")
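# If HF_REVISION is unset, REVISION is None and from_pretrained loads the repo's default branch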

# Configuration based on environment
if ON_GPU:
    attn_impl = "sdpa"  # Fast attention for GPU
    dtype = torch.bfloat16  # Mixed precision for GPU
    device_map = "auto"  # Auto device mapping for GPU
    low_cpu_mem = False  # Don't need low memory usage on GPU
else:
    attn_impl = "eager"  # Standard attention for CPU
    dtype = torch.float32  # Full precision for CPU
    device_map = "cpu"  # Force CPU device
    low_cpu_mem = True  # Enable low memory usage on CPU

print(f"πŸš€ Loading model: {MODEL_ID}")
print(f"πŸ”§ Environment: {'GPU' if ON_GPU else 'CPU'}")
print(f"πŸ“Š Configuration: attn={attn_impl}, dtype={dtype}, device={device_map}, revision={REVISION}")

# Expert categories for query classification
EXPERT_CATEGORIES = {
    "Code": ["programming", "software", "development", "coding", "algorithm", "python", "javascript", "java", "function", "code", "debug", "api", "framework", "library", "class", "method", "variable"],
    "Math": ["mathematics", "calculation", "equation", "formula", "statistics", "derivative", "integral", "algebra", "calculus", "math", "solve", "calculate", "probability", "geometry", "trigonometry"],
    "Reasoning": ["logic", "analysis", "reasoning", "problem-solving", "critical", "explain", "why", "how", "because", "analyze", "evaluate", "compare", "contrast", "deduce", "infer"],
    "Multilingual": ["translation", "language", "multilingual", "localization", "translate", "spanish", "french", "german", "chinese", "japanese", "korean", "arabic", "russian", "portuguese"],
    "General": ["general", "conversation", "assistance", "help", "hello", "hi", "what", "who", "when", "where", "tell", "describe", "explain"]
}

# Load model with robust error handling
model = None
tokenizer = None

try:
    # Load tokenizer
    print("πŸ“ Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID, 
        trust_remote_code=True, 
        revision=REVISION
    )
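    # The Phi-3.5 tokenizer normally defines a pad token; this guards against checkpoints that don't
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token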
    
    # Load model with environment-specific settings
    print("🧠 Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        revision=REVISION,
        attn_implementation=attn_impl,
        dtype=dtype,  # recent transformers versions accept `dtype` (older releases use `torch_dtype`)
        device_map=device_map,
        low_cpu_mem_usage=low_cpu_mem
    ).eval()
    
    print("βœ… Model loaded successfully!")
    
    # Verify model works with a simple generation
    print("πŸ” Running quick model test...")
    test_input = tokenizer("Hello, I am", return_tensors="pt").to(device_map if device_map != "auto" else model.device)
    with torch.no_grad():
        test_output = model.generate(**test_input, max_new_tokens=5)
    print("βœ… Model test successful!")
    
except Exception as e:
    print(f"⚠️ Model loading failed: {e}")
    print("⚠️ Continuing with limited functionality")

def classify_expert(query):
    """Classify query to determine which expert should handle it."""
    query_lower = query.lower()
    scores = {}
    
    for expert, keywords in EXPERT_CATEGORIES.items():
        score = sum(1 for keyword in keywords if keyword in query_lower)
        scores[expert] = score
    
    # Return the first expert with the highest keyword score; fall back to General when nothing matches
    max_score = max(scores.values()) if scores else 0
    if max_score > 0:
        experts = [expert for expert, score in scores.items() if score == max_score]
        return experts[0]
    return "General"

def generate_response(prompt, max_tokens=512, temperature=0.7, expert=None):
    """Generate response from the model."""
    if model is None or tokenizer is None:
        return "⚠️ Model not loaded. Please check the logs for errors."
    
    try:
        # Determine expert if not provided
        if expert is None:
            expert = classify_expert(prompt)
        
        # Create expert-specific prompt
        system_prompt = f"You are an AI assistant specialized in {expert}. "
        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"
        
        # Tokenize input
        inputs = tokenizer(full_prompt, return_tensors="pt")
        if ON_GPU:
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
        
        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        
        # Decode only the newly generated tokens so the prompt is not echoed back
        prompt_length = inputs["input_ids"].shape[-1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
        
        return response
        
    except Exception as e:
        return f"⚠️ Generation failed: {str(e)}"

def create_interface():
    """Create the Gradio interface."""
    
    with gr.Blocks(title="Phi-3.5-MoE Expert Assistant") as demo:
        gr.Markdown("# πŸ€– Phi-3.5-MoE Expert Assistant")
        gr.Markdown(f"**Environment:** {'GPU' if ON_GPU else 'CPU'} | **Model:** {MODEL_ID}")
        
        if model is None:
            gr.Markdown("⚠️ **Model failed to load. Limited functionality available.**")
        
        with gr.Row():
            with gr.Column(scale=3):
                prompt = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask me anything...",
                    lines=4
                )
                
                with gr.Row():
                    max_tokens = gr.Slider(
                        minimum=50, maximum=1024, value=512, step=50,
                        label="Max Tokens"
                    )
                    temperature = gr.Slider(
                        minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                        label="Temperature"
                    )
                    expert = gr.Dropdown(
                        choices=list(EXPERT_CATEGORIES.keys()),
                        value=None,
                        label="Expert (Optional)",
                        allow_custom_value=False
                    )
                
                generate_btn = gr.Button("Generate Response", variant="primary")
            
            with gr.Column(scale=2):
                response = gr.Textbox(
                    label="Response",
                    lines=10,
                    interactive=False
                )
        
        # Example prompts
        gr.Examples(
            examples=[
                ["Explain quantum computing in simple terms", None],
                ["Write a Python function to calculate fibonacci numbers", "Code"],
                ["What are the benefits of renewable energy?", "General"],
                ["How does machine learning work?", "Reasoning"],
                ["Translate 'Hello, how are you?' to Spanish", "Multilingual"],
                ["Solve the equation 3x^2 + 5x - 2 = 0", "Math"]
            ],
            inputs=[prompt, expert]
        )
        
        # Event handlers
        generate_btn.click(
            fn=generate_response,
            inputs=[prompt, max_tokens, temperature, expert],
            outputs=response
        )
        
        prompt.submit(
            fn=generate_response,
            inputs=[prompt, max_tokens, temperature, expert],
            outputs=response
        )
    
    return demo

if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )