#!/usr/bin/env python3
"""
Phi-3.5-MoE Expert Assistant
Robust application with CPU/GPU environment detection and dependency handling
"""
import os
import sys
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Apply the model patch if available
try:
    import model_patch
    print("✅ Applied model patch for handling missing dependencies")
except ImportError:
    print("ℹ️ Model patch not found, continuing without it")
# Environment detection
ON_GPU = torch.cuda.is_available()
MODEL_ID = os.getenv("HF_MODEL_ID", "microsoft/Phi-3.5-MoE-instruct")
REVISION = os.getenv("HF_REVISION")
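# e.g. set HF_MODEL_ID to point at a fork, or HF_REVISION to pin a specific
# commit of the model repo; both are optional, and the defaults load the
# latest official Phi-3.5-MoE-instruct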
# Configuration based on environment
if ON_GPU:
    attn_impl = "sdpa"      # Fast attention for GPU
    dtype = torch.bfloat16  # Mixed precision for GPU
    device_map = "auto"     # Auto device mapping for GPU
    low_cpu_mem = False     # Don't need low memory usage on GPU
else:
    attn_impl = "eager"     # Standard attention for CPU
    dtype = torch.float32   # Full precision for CPU
    device_map = "cpu"      # Force CPU device
    low_cpu_mem = True      # Enable low memory usage on CPU

print(f"🚀 Loading model: {MODEL_ID}")
print(f"🔧 Environment: {'GPU' if ON_GPU else 'CPU'}")
print(f"📋 Configuration: attn={attn_impl}, dtype={dtype}, device={device_map}, revision={REVISION}")
# Expert categories for query classification
EXPERT_CATEGORIES = {
"Code": ["programming", "software", "development", "coding", "algorithm", "python", "javascript", "java", "function", "code", "debug", "api", "framework", "library", "class", "method", "variable"],
"Math": ["mathematics", "calculation", "equation", "formula", "statistics", "derivative", "integral", "algebra", "calculus", "math", "solve", "calculate", "probability", "geometry", "trigonometry"],
"Reasoning": ["logic", "analysis", "reasoning", "problem-solving", "critical", "explain", "why", "how", "because", "analyze", "evaluate", "compare", "contrast", "deduce", "infer"],
"Multilingual": ["translation", "language", "multilingual", "localization", "translate", "spanish", "french", "german", "chinese", "japanese", "korean", "arabic", "russian", "portuguese"],
"General": ["general", "conversation", "assistance", "help", "hello", "hi", "what", "who", "when", "where", "tell", "describe", "explain"]
}
# Load model with robust error handling
model = None
tokenizer = None
try:
    # Load tokenizer
    print("📝 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        revision=REVISION
    )

    # Load model with environment-specific settings
    print("🧠 Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        revision=REVISION,
        attn_implementation=attn_impl,
        dtype=dtype,  # `dtype` is the newer kwarg name; older transformers expect `torch_dtype`
        device_map=device_map,
        low_cpu_mem_usage=low_cpu_mem
    ).eval()
    print("✅ Model loaded successfully!")

    # Verify the model works with a simple generation;
    # model.device covers both the device_map="auto" and the plain CPU case
    print("🔍 Running quick model test...")
    test_input = tokenizer("Hello, I am", return_tensors="pt").to(model.device)
    with torch.no_grad():
        test_output = model.generate(**test_input, max_new_tokens=5)
    print("✅ Model test successful!")
except Exception as e:
    print(f"⚠️ Model loading failed: {e}")
    print("⚠️ Continuing with limited functionality")
def classify_expert(query):
    """Classify a query to determine which expert should handle it."""
    query_lower = query.lower()
    scores = {}
    for expert, keywords in EXPERT_CATEGORIES.items():
        score = sum(1 for keyword in keywords if keyword in query_lower)
        scores[expert] = score

    # Pick the expert with the highest keyword score; ties resolve to the
    # first match in EXPERT_CATEGORIES order, and zero matches fall back to General
    max_score = max(scores.values()) if scores else 0
    if max_score > 0:
        experts = [expert for expert, score in scores.items() if score == max_score]
        return experts[0]
    return "General"
def generate_response(prompt, max_tokens=512, temperature=0.7, expert=None):
    """Generate a response from the model."""
    if model is None or tokenizer is None:
        return "⚠️ Model not loaded. Please check the logs for errors."

    try:
        # Determine the expert if not provided
        if expert is None:
            expert = classify_expert(prompt)

        # Create an expert-specific prompt
        system_prompt = f"You are an AI assistant specialized in {expert}. "
        full_prompt = f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"

        # Tokenize input
        inputs = tokenizer(full_prompt, return_tensors="pt")
        if ON_GPU:
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens; slicing the decoded string
        # by len(full_prompt) would be fragile because detokenization does not
        # round-trip the prompt text exactly
        new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        return response
    except Exception as e:
        return f"⚠️ Generation failed: {str(e)}"
def create_interface():
    """Create the Gradio interface."""
    with gr.Blocks(title="Phi-3.5-MoE Expert Assistant") as demo:
        gr.Markdown("# 🤖 Phi-3.5-MoE Expert Assistant")
        gr.Markdown(f"**Environment:** {'GPU' if ON_GPU else 'CPU'} | **Model:** {MODEL_ID}")
        if model is None:
            gr.Markdown("⚠️ **Model failed to load. Limited functionality available.**")

        with gr.Row():
            with gr.Column(scale=3):
                prompt = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask me anything...",
                    lines=4
                )
                with gr.Row():
                    max_tokens = gr.Slider(
                        minimum=50, maximum=1024, value=512, step=50,
                        label="Max Tokens"
                    )
                    temperature = gr.Slider(
                        minimum=0.1, maximum=2.0, value=0.7, step=0.1,
                        label="Temperature"
                    )
                expert = gr.Dropdown(
                    choices=list(EXPERT_CATEGORIES.keys()),
                    value=None,
                    label="Expert (Optional)",
                    allow_custom_value=False
                )
                generate_btn = gr.Button("Generate Response", variant="primary")
            with gr.Column(scale=2):
                response = gr.Textbox(
                    label="Response",
                    lines=10,
                    interactive=False
                )

        # Example prompts
        gr.Examples(
            examples=[
                ["Explain quantum computing in simple terms", None],
                ["Write a Python function to calculate fibonacci numbers", "Code"],
                ["What are the benefits of renewable energy?", "General"],
                ["How does machine learning work?", "Reasoning"],
                ["Translate 'Hello, how are you?' to Spanish", "Multilingual"],
                ["Solve the equation 3x^2 + 5x - 2 = 0", "Math"]
            ],
            inputs=[prompt, expert]
        )

        # Event handlers: the button click and Enter in the prompt box both generate
        generate_btn.click(
            fn=generate_response,
            inputs=[prompt, max_tokens, temperature, expert],
            outputs=response
        )
        prompt.submit(
            fn=generate_response,
            inputs=[prompt, max_tokens, temperature, expert],
            outputs=response
        )
    return demo
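# Deployment note: 0.0.0.0:7860 below is the host/port a Hugging Face Space
# expects; calling demo.queue() before launch() would serialize concurrent
# generation requests on a single GPU (optional, and not enabled here).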
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )