Spaces:

jlov7
/

Dynamic-Function-Calling-Agent

Sleeping

App Files Files Community

jlov7 committed on Jul 21

Commit

1b5bd3c

1 Parent(s): 5410dc5

fix: add timeout protection and optimize inference for HF Spaces

Browse files

Files changed (1) hide show

test_constrained_model.py +117 -51

test_constrained_model.py CHANGED Viewed

@@ -14,70 +14,136 @@ from typing import Dict, List
 import time
 def load_trained_model():
     """Load the SmolLM3-3B base checkpoint together with its tokenizer.

     Only the base model is loaded; no LoRA adapter is applied here.

     Returns:
         A ``(model, tokenizer)`` tuple.
     """
     checkpoint = "HuggingFaceTB/SmolLM3-3B"
     print("🔄 Loading SmolLM3-3B (base model for demo)...")
     tok = AutoTokenizer.from_pretrained(checkpoint)
     # Fall back to the EOS token when the tokenizer defines no pad token.
     if tok.pad_token is None:
         tok.pad_token = tok.eos_token
     # Prefer Apple's MPS backend when present; otherwise let the library
     # choose the device placement.
     if torch.backends.mps.is_available():
         placement = "mps"
     else:
         placement = "auto"
     lm = AutoModelForCausalLM.from_pretrained(
         checkpoint,
         torch_dtype=torch.float32,
         device_map=placement,
     )
     # The LoRA adapter is left out of the repo to keep it small; for a
     # production deployment it would be uploaded to the HF Hub and loaded
     # from there.
     print("🔧 Using base model (LoRA adapter excluded for size constraints)...")
     print("✅ Trained model loaded successfully")
     return lm, tok
 def constrained_json_generate(model, tokenizer, prompt: str, schema: Dict, max_attempts: int = 3):
     """Sample JSON from the model, retrying until it parses and validates.

     Each attempt samples with a slightly higher temperature (0.1, 0.2, ...)
     and checks the decoded continuation with ``json.loads`` and, when a
     schema is supplied, ``jsonschema.validate``.

     Args:
         model: Causal LM exposing ``parameters()`` and ``generate()``.
         tokenizer: Tokenizer matching ``model``.
         prompt: Text the model should continue.
         schema: JSON schema to validate against; a falsy value skips
             schema validation.
         max_attempts: Maximum number of sampling attempts.

     Returns:
         ``(response, success, error)``: the last decoded response, whether
         it parsed/validated, and an error message (``None`` on success).
     """
     device = next(model.parameters()).device
     # The prompt is identical for every attempt, so tokenize it only once.
     inputs = tokenizer(prompt, return_tensors="pt").to(device)
     prompt_len = inputs['input_ids'].shape[1]
     # Bound up front so the final return is valid even when the loop body
     # never runs (max_attempts <= 0).
     response = ""
     for attempt in range(max_attempts):
         is_last_attempt = attempt == max_attempts - 1
         # Generate with different temperatures for diversity
         temperature = 0.1 + (attempt * 0.1)
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
                 max_new_tokens=200,
                 temperature=temperature,
                 do_sample=True,
                 top_p=0.9,
                 pad_token_id=tokenizer.eos_token_id,
                 eos_token_id=tokenizer.eos_token_id
             )
         # Decode only the newly generated tokens, not the echoed prompt.
         response = tokenizer.decode(
             outputs[0][prompt_len:],
             skip_special_tokens=True
         ).strip()
         try:
             parsed = json.loads(response)
             # Validate against schema if provided
             if schema:
                 jsonschema.validate(parsed, schema)
             return response, True, None
         except json.JSONDecodeError as e:
             if is_last_attempt:
                 return response, False, str(e)
         except jsonschema.ValidationError as e:
             if is_last_attempt:
                 return response, False, f"Schema validation: {str(e)}"
     return response, False, "Max attempts exceeded"
 def create_test_schemas():
     """Create the test schemas we're evaluating against."""

 import time
 def load_trained_model():
     """Load the SmolLM3-3B base model and tokenizer for the agent.

     No fine-tuned adapter is loaded yet: the function only announces that
     the adapter is unavailable and returns the base model.

     Returns:
         A ``(model, tokenizer)`` tuple.

     Raises:
         Exception: Any error raised while loading is printed and re-raised
             unchanged.
     """
     print("🔄 Loading SmolLM3-3B Function-Calling Agent...")
     base_model_name = "HuggingFaceTB/SmolLM3-3B"
     try:
         print("🔄 Loading tokenizer...")
         tokenizer = AutoTokenizer.from_pretrained(base_model_name)
         # Fall back to the EOS token when the tokenizer defines no pad token.
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
         print("🔄 Loading base model...")
         # float16 and low_cpu_mem_usage are used to reduce memory on
         # Hugging Face Spaces.
         # NOTE(review): float16 on CPU-only hardware may be slow or
         # unsupported for some ops - confirm on the target Space.
         model = AutoModelForCausalLM.from_pretrained(
             base_model_name,
             torch_dtype=torch.float16,
             device_map="auto",
             low_cpu_mem_usage=True,
         )
         # TODO: once the "jlov7/SmolLM3-Function-Calling-LoRA" adapter is
         # published, load it here with peft.PeftModel.from_pretrained and
         # merge it via merge_and_unload(), with its own error handling.
         # Until then there is nothing that can fail at this step, so no
         # try/except is wrapped around these status messages.
         print("🔄 Attempting to load fine-tuned adapter...")
         print("🔧 Fine-tuned adapter not yet available - using base model with optimized prompting")
         print("✅ Model loaded successfully")
         return model, tokenizer
     except Exception as e:
         # Surface the failure in the logs, then let the caller handle it.
         print(f"❌ Error loading model: {e}")
         raise
 def constrained_json_generate(model, tokenizer, prompt: str, schema: Dict, max_attempts: int = 3, timeout: float = 20.0):
     """Sample JSON from the model with retries, validation and a time limit.

     Each attempt samples with a slightly higher temperature (0.1, 0.2, ...)
     in a daemon thread, extracts the first JSON object from the decoded
     continuation, and checks it with ``json.loads`` and, when a schema is
     supplied, ``jsonschema.validate``.

     Args:
         model: Causal LM exposing ``parameters()`` and ``generate()``.
         tokenizer: Tokenizer matching ``model``.
         prompt: Text the model should continue.
         schema: JSON schema to validate against; a falsy value skips
             schema validation.
         max_attempts: Maximum number of sampling attempts.
         timeout: Seconds to wait for a single generation before giving up.

     Returns:
         ``(response, success, error)``: the extracted JSON text (``""``
         when generation itself failed), whether it parsed/validated, and
         an error message (``None`` on success).
     """
     import threading
     device = next(model.parameters()).device
     for attempt in range(max_attempts):
         is_last_attempt = attempt == max_attempts - 1
         try:
             # Generate with different temperatures for diversity
             temperature = 0.1 + (attempt * 0.1)
             inputs = tokenizer(prompt, return_tensors="pt").to(device)
             # Filled in by the worker: "text" on success, "error" on failure.
             outcome = {}
             def run_generation():
                 try:
                     with torch.no_grad():
                         outputs = model.generate(
                             **inputs,
                             max_new_tokens=100,  # Reduced for faster generation
                             temperature=temperature,
                             do_sample=True,
                             pad_token_id=tokenizer.eos_token_id,
                             eos_token_id=tokenizer.eos_token_id,
                             num_return_sequences=1,
                             use_cache=True
                         )
                     # Decode only the newly generated tokens.
                     generated_ids = outputs[0][inputs['input_ids'].shape[1]:]
                     text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
                     outcome["text"] = _extract_first_json_object(text)
                 except Exception as exc:
                     outcome["error"] = str(exc)
             # Threads give a cross-platform timeout; daemon=True keeps a
             # stuck generation from blocking interpreter shutdown.
             worker = threading.Thread(target=run_generation, daemon=True)
             worker.start()
             worker.join(timeout=timeout)
             if worker.is_alive():
                 # The worker cannot be cancelled and is still using the
                 # model, so return now rather than start a second,
                 # concurrent generate() call.
                 return "", False, f"Generation timed out (attempt {attempt + 1})"
             # Test for the key, not truthiness: an exception with an empty
             # message must still count as a failure.
             if "error" in outcome:
                 if is_last_attempt:
                     return "", False, f"Generation error: {outcome['error']}"
                 continue
             candidate = outcome.get("text", "")
             try:
                 parsed = json.loads(candidate)
                 # Validate against schema only if one was provided
                 if schema:
                     jsonschema.validate(parsed, schema)
             except json.JSONDecodeError as exc:
                 problem = str(exc)
             except jsonschema.ValidationError as exc:
                 problem = str(exc)
             else:
                 return candidate, True, None
             if is_last_attempt:
                 return candidate, False, f"JSON validation failed: {problem}"
         except Exception as e:
             if is_last_attempt:
                 return "", False, f"Generation error: {str(e)}"
             continue
     return "", False, "All generation attempts failed"
 def _extract_first_json_object(text: str) -> str:
     """Return the first substring of ``text`` that decodes as a JSON object.

     Uses the real JSON decoder, so braces inside string values are handled
     correctly. Returns ``text`` unchanged when no object can be decoded.
     """
     decoder = json.JSONDecoder()
     start = text.find("{")
     while start != -1:
         try:
             _, end = decoder.raw_decode(text, start)
         except json.JSONDecodeError:
             # Not a valid object at this brace; try the next one.
             start = text.find("{", start + 1)
         else:
             return text[start:end]
     return text
 def create_test_schemas():
     """Create the test schemas we're evaluating against."""