akseljoonas HF Staff committed on
Commit
72bac94
·
1 Parent(s): 9615e37

inference token

Browse files
agent/context_manager/manager.py CHANGED
@@ -165,10 +165,12 @@ class ContextManager:
165
  )
166
  )
167
 
 
168
  response = await acompletion(
169
  model=model_name,
170
  messages=messages_to_summarize,
171
  max_completion_tokens=self.compact_size,
 
172
  )
173
  summarized_message = Message(
174
  role="assistant", content=response.choices[0].message.content
 
165
  )
166
  )
167
 
168
+ api_key = os.environ.get("INFERENCE_TOKEN")
169
  response = await acompletion(
170
  model=model_name,
171
  messages=messages_to_summarize,
172
  max_completion_tokens=self.compact_size,
173
+ **({'api_key': api_key} if api_key and model_name.startswith('huggingface/') else {}),
174
  )
175
  summarized_message = Message(
176
  role="assistant", content=response.choices[0].message.content
agent/core/agent_loop.py CHANGED
@@ -5,8 +5,9 @@ Main agent implementation with integrated tool system and MCP support
5
  import asyncio
6
  import json
7
  import logging
 
8
 
9
- from litellm import ChatCompletionMessageToolCall, Message, ModelResponse, acompletion
10
  from lmnr import observe
11
 
12
  from agent.config import Config
@@ -17,6 +18,9 @@ from agent.tools.jobs_tool import CPU_FLAVORS
17
  logger = logging.getLogger(__name__)
18
 
19
  ToolCall = ChatCompletionMessageToolCall
 
 
 
20
 
21
 
22
  def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
@@ -41,7 +45,9 @@ def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
41
  return True, None
42
 
43
 
44
- def _needs_approval(tool_name: str, tool_args: dict, config: Config | None = None) -> bool:
 
 
45
  """Check if a tool call requires user approval before execution."""
46
  # Yolo mode: skip all approvals
47
  if config and config.yolo_mode:
@@ -56,19 +62,24 @@ def _needs_approval(tool_name: str, tool_args: dict, config: Config | None = Non
56
  operation = tool_args.get("operation", "")
57
  if operation not in ["run", "uv", "scheduled run", "scheduled uv"]:
58
  return False
59
-
60
  # Check if this is a CPU-only job
61
  # hardware_flavor is at top level of tool_args, not nested in args
62
- hardware_flavor = tool_args.get("hardware_flavor") or tool_args.get("flavor") or tool_args.get("hardware") or "cpu-basic"
 
 
 
 
 
63
  is_cpu_job = hardware_flavor in CPU_FLAVORS
64
-
65
  if is_cpu_job:
66
  if config and not config.confirm_cpu_jobs:
67
  return False
68
  return True
69
-
70
  return True
71
-
72
  # Check for file upload operations (hf_private_repos or other tools)
73
  if tool_name == "hf_private_repos":
74
  operation = tool_args.get("operation", "")
@@ -89,7 +100,13 @@ def _needs_approval(tool_name: str, tool_args: dict, config: Config | None = Non
89
  # hf_repo_git: destructive operations require approval
90
  if tool_name == "hf_repo_git":
91
  operation = tool_args.get("operation", "")
92
- if operation in ["delete_branch", "delete_tag", "merge_pr", "create_repo", "update_repo"]:
 
 
 
 
 
 
93
  return True
94
 
95
  return False
@@ -140,6 +157,12 @@ class Handlers:
140
  tool_choice="auto",
141
  stream=True,
142
  stream_options={"include_usage": True},
 
 
 
 
 
 
143
  )
144
 
145
  full_content = ""
@@ -180,13 +203,13 @@ class Handlers:
180
  tool_calls_acc[idx]["id"] = tc_delta.id
181
  if tc_delta.function:
182
  if tc_delta.function.name:
183
- tool_calls_acc[idx]["function"][
184
- "name"
185
- ] += tc_delta.function.name
186
  if tc_delta.function.arguments:
187
- tool_calls_acc[idx]["function"][
188
- "arguments"
189
- ] += tc_delta.function.arguments
190
 
191
  # Capture usage from the final chunk
192
  if hasattr(chunk, "usage") and chunk.usage:
@@ -219,9 +242,7 @@ class Handlers:
219
  if not tool_calls:
220
  if content:
221
  assistant_msg = Message(role="assistant", content=content)
222
- session.context_manager.add_message(
223
- assistant_msg, token_count
224
- )
225
  final_response = content
226
  break
227
 
 
5
  import asyncio
6
  import json
7
  import logging
8
+ import os
9
 
10
+ from litellm import ChatCompletionMessageToolCall, Message, acompletion
11
  from lmnr import observe
12
 
13
  from agent.config import Config
 
18
  logger = logging.getLogger(__name__)
19
 
20
  ToolCall = ChatCompletionMessageToolCall
21
+ # Explicit inference token — needed because litellm checks HF_TOKEN before
22
+ # HUGGINGFACE_API_KEY, and HF_TOKEN (used for Hub ops) may lack inference permissions.
23
+ _INFERENCE_API_KEY = os.environ.get("INFERENCE_TOKEN")
24
 
25
 
26
  def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
 
45
  return True, None
46
 
47
 
48
+ def _needs_approval(
49
+ tool_name: str, tool_args: dict, config: Config | None = None
50
+ ) -> bool:
51
  """Check if a tool call requires user approval before execution."""
52
  # Yolo mode: skip all approvals
53
  if config and config.yolo_mode:
 
62
  operation = tool_args.get("operation", "")
63
  if operation not in ["run", "uv", "scheduled run", "scheduled uv"]:
64
  return False
65
+
66
  # Check if this is a CPU-only job
67
  # hardware_flavor is at top level of tool_args, not nested in args
68
+ hardware_flavor = (
69
+ tool_args.get("hardware_flavor")
70
+ or tool_args.get("flavor")
71
+ or tool_args.get("hardware")
72
+ or "cpu-basic"
73
+ )
74
  is_cpu_job = hardware_flavor in CPU_FLAVORS
75
+
76
  if is_cpu_job:
77
  if config and not config.confirm_cpu_jobs:
78
  return False
79
  return True
80
+
81
  return True
82
+
83
  # Check for file upload operations (hf_private_repos or other tools)
84
  if tool_name == "hf_private_repos":
85
  operation = tool_args.get("operation", "")
 
100
  # hf_repo_git: destructive operations require approval
101
  if tool_name == "hf_repo_git":
102
  operation = tool_args.get("operation", "")
103
+ if operation in [
104
+ "delete_branch",
105
+ "delete_tag",
106
+ "merge_pr",
107
+ "create_repo",
108
+ "update_repo",
109
+ ]:
110
  return True
111
 
112
  return False
 
157
  tool_choice="auto",
158
  stream=True,
159
  stream_options={"include_usage": True},
160
+ **(
161
+ {"api_key": _INFERENCE_API_KEY}
162
+ if _INFERENCE_API_KEY
163
+ and session.config.model_name.startswith("huggingface/")
164
+ else {}
165
+ ),
166
  )
167
 
168
  full_content = ""
 
203
  tool_calls_acc[idx]["id"] = tc_delta.id
204
  if tc_delta.function:
205
  if tc_delta.function.name:
206
+ tool_calls_acc[idx]["function"]["name"] += (
207
+ tc_delta.function.name
208
+ )
209
  if tc_delta.function.arguments:
210
+ tool_calls_acc[idx]["function"]["arguments"] += (
211
+ tc_delta.function.arguments
212
+ )
213
 
214
  # Capture usage from the final chunk
215
  if hasattr(chunk, "usage") and chunk.usage:
 
242
  if not tool_calls:
243
  if content:
244
  assistant_msg = Message(role="assistant", content=content)
245
+ session.context_manager.add_message(assistant_msg, token_count)
 
 
246
  final_response = content
247
  break
248
 
backend/routes/agent.py CHANGED
@@ -5,13 +5,19 @@ dependency. In dev mode (no OAUTH_CLIENT_ID), auth is bypassed automatically.
5
  """
6
 
7
  import logging
 
8
  from typing import Any
9
 
10
- from fastapi import APIRouter, Depends, HTTPException, Request, WebSocket, WebSocketDisconnect
11
-
12
  from dependencies import get_current_user, get_ws_user
 
 
 
 
 
 
 
 
13
  from litellm import acompletion
14
-
15
  from models import (
16
  ApprovalRequest,
17
  HealthResponse,
@@ -27,6 +33,31 @@ logger = logging.getLogger(__name__)
27
 
28
  router = APIRouter(prefix="/api", tags=["agent"])
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  def _check_session_access(session_id: str, user: dict[str, Any]) -> None:
32
  """Verify the user has access to the given session. Raises 403 or 404."""
@@ -58,21 +89,37 @@ async def llm_health_check() -> LLMHealthResponse:
58
  - timeout / network → provider unreachable
59
  """
60
  model = session_manager.config.model_name
 
 
 
 
61
  try:
62
  await acompletion(
63
  model=model,
64
  messages=[{"role": "user", "content": "hi"}],
65
  max_tokens=1,
66
  timeout=10,
 
67
  )
68
  return LLMHealthResponse(status="ok", model=model)
69
  except Exception as e:
70
  err_str = str(e).lower()
71
  error_type = "unknown"
72
 
73
- if "401" in err_str or "auth" in err_str or "invalid" in err_str or "api key" in err_str:
 
 
 
 
 
74
  error_type = "auth"
75
- elif "402" in err_str or "credit" in err_str or "quota" in err_str or "insufficient" in err_str or "billing" in err_str:
 
 
 
 
 
 
76
  error_type = "credits"
77
  elif "429" in err_str or "rate" in err_str:
78
  error_type = "rate_limit"
@@ -88,14 +135,6 @@ async def llm_health_check() -> LLMHealthResponse:
88
  )
89
 
90
 
91
- AVAILABLE_MODELS = [
92
- {"id": "huggingface/novita/MiniMaxAI/MiniMax-M2.1", "label": "MiniMax M2.1", "provider": "huggingface", "recommended": True},
93
- {"id": "anthropic/claude-opus-4-5-20251101", "label": "Claude Opus 4.5", "provider": "anthropic", "recommended": True},
94
- {"id": "huggingface/novita/moonshotai/Kimi-K2.5", "label": "Kimi K2.5", "provider": "huggingface"},
95
- {"id": "huggingface/novita/zai-org/GLM-5", "label": "GLM 5", "provider": "huggingface"},
96
- ]
97
-
98
-
99
  @router.get("/config/model")
100
  async def get_model() -> dict:
101
  """Get current model and available models. No auth required."""
@@ -106,9 +145,7 @@ async def get_model() -> dict:
106
 
107
 
108
  @router.post("/config/model")
109
- async def set_model(
110
- body: dict, user: dict = Depends(get_current_user)
111
- ) -> dict:
112
  """Set the LLM model. Applies to new conversations."""
113
  model_id = body.get("model")
114
  if not model_id:
@@ -127,6 +164,10 @@ async def generate_title(
127
  ) -> dict:
128
  """Generate a short title for a chat session based on the first user message."""
129
  model = session_manager.config.model_name
 
 
 
 
130
  try:
131
  response = await acompletion(
132
  model=model,
@@ -144,6 +185,7 @@ async def generate_title(
144
  max_tokens=20,
145
  temperature=0.3,
146
  timeout=8,
 
147
  )
148
  title = response.choices[0].message.content.strip().strip('"').strip("'")
149
  # Safety: cap at 50 chars
@@ -259,9 +301,7 @@ async def interrupt_session(
259
 
260
 
261
  @router.post("/undo/{session_id}")
262
- async def undo_session(
263
- session_id: str, user: dict = Depends(get_current_user)
264
- ) -> dict:
265
  """Undo the last turn in a session."""
266
  _check_session_access(session_id, user)
267
  success = await session_manager.undo(session_id)
@@ -312,7 +352,9 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str) -> None:
312
  # Authenticate the WebSocket connection
313
  user = await get_ws_user(websocket)
314
  if not user:
315
- logger.warning(f"WebSocket rejected: authentication failed for session {session_id}")
 
 
316
  await websocket.accept()
317
  await websocket.close(code=4001, reason="Authentication required")
318
  return
@@ -340,10 +382,12 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str) -> None:
340
  # knows the session is alive. The original ready event from _run_session
341
  # fires before the WS is connected and is always lost.
342
  try:
343
- await websocket.send_json({
344
- "event_type": "ready",
345
- "data": {"message": "Agent initialized"},
346
- })
 
 
347
  except Exception as e:
348
  logger.error(f"Failed to send ready event for session {session_id}: {e}")
349
 
 
5
  """
6
 
7
  import logging
8
+ import os
9
  from typing import Any
10
 
 
 
11
  from dependencies import get_current_user, get_ws_user
12
+ from fastapi import (
13
+ APIRouter,
14
+ Depends,
15
+ HTTPException,
16
+ Request,
17
+ WebSocket,
18
+ WebSocketDisconnect,
19
+ )
20
  from litellm import acompletion
 
21
  from models import (
22
  ApprovalRequest,
23
  HealthResponse,
 
33
 
34
  router = APIRouter(prefix="/api", tags=["agent"])
35
 
36
+ AVAILABLE_MODELS = [
37
+ {
38
+ "id": "huggingface/novita/MiniMaxAI/MiniMax-M2.1",
39
+ "label": "MiniMax M2.1",
40
+ "provider": "huggingface",
41
+ "recommended": True,
42
+ },
43
+ {
44
+ "id": "anthropic/claude-opus-4-5-20251101",
45
+ "label": "Claude Opus 4.5",
46
+ "provider": "anthropic",
47
+ "recommended": True,
48
+ },
49
+ {
50
+ "id": "huggingface/novita/moonshotai/Kimi-K2.5",
51
+ "label": "Kimi K2.5",
52
+ "provider": "huggingface",
53
+ },
54
+ {
55
+ "id": "huggingface/novita/zai-org/GLM-5",
56
+ "label": "GLM 5",
57
+ "provider": "huggingface",
58
+ },
59
+ ]
60
+
61
 
62
  def _check_session_access(session_id: str, user: dict[str, Any]) -> None:
63
  """Verify the user has access to the given session. Raises 403 or 404."""
 
89
  - timeout / network → provider unreachable
90
  """
91
  model = session_manager.config.model_name
92
+ hf_key = os.environ.get("INFERENCE_TOKEN")
93
+ api_key_kw = (
94
+ {"api_key": hf_key} if hf_key and model.startswith("huggingface/") else {}
95
+ )
96
  try:
97
  await acompletion(
98
  model=model,
99
  messages=[{"role": "user", "content": "hi"}],
100
  max_tokens=1,
101
  timeout=10,
102
+ **api_key_kw,
103
  )
104
  return LLMHealthResponse(status="ok", model=model)
105
  except Exception as e:
106
  err_str = str(e).lower()
107
  error_type = "unknown"
108
 
109
+ if (
110
+ "401" in err_str
111
+ or "auth" in err_str
112
+ or "invalid" in err_str
113
+ or "api key" in err_str
114
+ ):
115
  error_type = "auth"
116
+ elif (
117
+ "402" in err_str
118
+ or "credit" in err_str
119
+ or "quota" in err_str
120
+ or "insufficient" in err_str
121
+ or "billing" in err_str
122
+ ):
123
  error_type = "credits"
124
  elif "429" in err_str or "rate" in err_str:
125
  error_type = "rate_limit"
 
135
  )
136
 
137
 
 
 
 
 
 
 
 
 
138
  @router.get("/config/model")
139
  async def get_model() -> dict:
140
  """Get current model and available models. No auth required."""
 
145
 
146
 
147
  @router.post("/config/model")
148
+ async def set_model(body: dict, user: dict = Depends(get_current_user)) -> dict:
 
 
149
  """Set the LLM model. Applies to new conversations."""
150
  model_id = body.get("model")
151
  if not model_id:
 
164
  ) -> dict:
165
  """Generate a short title for a chat session based on the first user message."""
166
  model = session_manager.config.model_name
167
+ hf_key = os.environ.get("INFERENCE_TOKEN")
168
+ api_key_kw = (
169
+ {"api_key": hf_key} if hf_key and model.startswith("huggingface/") else {}
170
+ )
171
  try:
172
  response = await acompletion(
173
  model=model,
 
185
  max_tokens=20,
186
  temperature=0.3,
187
  timeout=8,
188
+ **api_key_kw,
189
  )
190
  title = response.choices[0].message.content.strip().strip('"').strip("'")
191
  # Safety: cap at 50 chars
 
301
 
302
 
303
  @router.post("/undo/{session_id}")
304
+ async def undo_session(session_id: str, user: dict = Depends(get_current_user)) -> dict:
 
 
305
  """Undo the last turn in a session."""
306
  _check_session_access(session_id, user)
307
  success = await session_manager.undo(session_id)
 
352
  # Authenticate the WebSocket connection
353
  user = await get_ws_user(websocket)
354
  if not user:
355
+ logger.warning(
356
+ f"WebSocket rejected: authentication failed for session {session_id}"
357
+ )
358
  await websocket.accept()
359
  await websocket.close(code=4001, reason="Authentication required")
360
  return
 
382
  # knows the session is alive. The original ready event from _run_session
383
  # fires before the WS is connected and is always lost.
384
  try:
385
+ await websocket.send_json(
386
+ {
387
+ "event_type": "ready",
388
+ "data": {"message": "Agent initialized"},
389
+ }
390
+ )
391
  except Exception as e:
392
  logger.error(f"Failed to send ready event for session {session_id}: {e}")
393