akseljoonas HF Staff committed on
Commit
e83f59a
·
1 Parent(s): 0d853ec

added whooooshing to api spec

Browse files
README.md CHANGED
@@ -75,7 +75,7 @@ HF_NAMESPACE=<hf-namespace-to-use>
75
  │ │ │ │ │ ToolRouter │ │ │ │ │ │
76
  │ │ │ │ │ ├─ explore_hf_docs │ │ │ │ │ │
77
  │ │ │ │ │ ├─ fetch_hf_docs │ │ │ │ │ │
78
- │ │ │ │ │ ├─ search_hf_api_endpoints│ │ │ │ │ │
79
  │ │ │ │ │ ├─ plan_tool │ │ │ │ │ │
80
  │ │ │ │ │ ├─ hf_jobs* │ │ │ │ │ │
81
  │ │ │ │ │ ├─ hf_private_repos* │ │ │ │ │ │
 
75
  │ │ │ │ │ ToolRouter │ │ │ │ │ │
76
  │ │ │ │ │ ├─ explore_hf_docs │ │ │ │ │ │
77
  │ │ │ │ │ ├─ fetch_hf_docs │ │ │ │ │ │
78
+ │ │ │ │ │ ├─ find_hf_api │ │ │ │ │ │
79
  │ │ │ │ │ ├─ plan_tool │ │ │ │ │ │
80
  │ │ │ │ │ ├─ hf_jobs* │ │ │ │ │ │
81
  │ │ │ │ │ ├─ hf_private_repos* │ │ │ │ │ │
agent/prompts/system_prompt_v2.yaml CHANGED
@@ -41,7 +41,7 @@ system_prompt: |
41
  - For monitoring: "trackio"
42
  - For inference: "vllm", "inference-endpoints"
43
  5. ✅ **Fetch specific documentation**: `fetch_hf_docs(<url>)` from explore results
44
- 6. ✅ **Search API endpoints if needed**: `search_hf_api_endpoints(<tag>)` for API patterns
45
 
46
  **✓ CORRECT Research Pattern:**
47
  ```python
@@ -301,10 +301,12 @@ system_prompt: |
301
  - Get complete API documentation, examples, parameters
302
  - Critical for training tasks to get current trainer configs
303
 
304
- **search_hf_api_endpoints:**
305
- - Use when building scripts that call Hub API directly
 
 
306
  - Returns curl examples with authentication patterns
307
- - Useful for advanced Hub operations
308
 
309
  ## Hub Discovery Tools (MCP)
310
 
@@ -546,7 +548,7 @@ system_prompt: |
546
 
547
  I can see the Space is failing with a Python import error. It's trying to import 'gradio' but the package isn't in requirements.txt.
548
 
549
- [Explores documentation: explore_hf_docs("gradio"), search_hf_api_endpoints for spaces endpoints (e.g. seeing logs) ]
550
 
551
  Based on the Gradio docs, I need to add gradio to requirements.txt and ensure the app.py file is correctly structured.
552
 
 
41
  - For monitoring: "trackio"
42
  - For inference: "vllm", "inference-endpoints"
43
  5. ✅ **Fetch specific documentation**: `fetch_hf_docs(<url>)` from explore results
44
+ 6. ✅ **Find API endpoints if needed**: `find_hf_api(query="space logs")` or `find_hf_api(tag="spaces")` for REST API operations
45
 
46
  **✓ CORRECT Research Pattern:**
47
  ```python
 
301
  - Get complete API documentation, examples, parameters
302
  - Critical for training tasks to get current trainer configs
303
 
304
+ **find_hf_api:**
305
+ - Find REST API endpoints by keyword search or tag browsing
306
+ - Use `query` for keyword search (e.g., "space logs", "organization members", "jwt token")
307
+ - Use `tag` to browse all endpoints in a category
308
  - Returns curl examples with authentication patterns
309
+ - Use for API-only operations: streaming logs/metrics, org management, security scans, etc.
310
 
311
  ## Hub Discovery Tools (MCP)
312
 
 
548
 
549
  I can see the Space is failing with a Python import error. It's trying to import 'gradio' but the package isn't in requirements.txt.
550
 
551
+ [Explores documentation: explore_hf_docs("gradio"), find_hf_api(query="space logs") for streaming logs]
552
 
553
  Based on the Gradio docs, I need to add gradio to requirements.txt and ensure the app.py file is correctly structured.
554
 
agent/tools/docs_tools.py CHANGED
@@ -57,6 +57,7 @@ _docs_cache: dict[str, list[dict[str, str]]] = {}
57
  _index_cache: dict[str, tuple[Any, MultifieldParser]] = {}
58
  _cache_lock = asyncio.Lock()
59
  _openapi_cache: dict[str, Any] | None = None
 
60
 
61
  # ---------------------------------------------------------------------------
62
  # Gradio Documentation
@@ -441,6 +442,113 @@ def _extract_all_tags(spec: dict[str, Any]) -> list[str]:
441
  return sorted(tags)
442
 
443
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
  def _generate_curl_example(endpoint: dict[str, Any]) -> str:
445
  """Generate curl command example for an endpoint."""
446
  method = endpoint["method"]
@@ -535,26 +643,55 @@ def _format_response_info(responses: dict[str, Any]) -> str:
535
  return "\n".join(output)
536
 
537
 
538
- def _format_openapi_results(results: list[dict[str, Any]], tag: str) -> str:
 
 
 
 
 
539
  """Format OpenAPI search results with curl examples."""
540
  if not results:
541
- return f"No API endpoints found with tag '{tag}'"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
 
543
- out = f"# API Endpoints for tag: `{tag}`\n\n"
544
- out += f"Found {len(results)} endpoint(s)\n\n---\n\n"
 
 
545
 
546
  for i, ep in enumerate(results, 1):
547
  out += f"## {i}. {ep['method']} {ep['path']}\n\n"
548
 
549
- if ep["summary"]:
 
 
 
550
  out += f"**Summary:** {ep['summary']}\n\n"
551
 
552
- if ep["description"]:
553
  desc = ep["description"][:300]
554
  if len(ep["description"]) > 300:
555
  desc += "..."
556
  out += f"**Description:** {desc}\n\n"
557
 
 
 
 
558
  params_info = _format_parameters(ep.get("parameters", []))
559
  if params_info:
560
  out += params_info + "\n\n"
@@ -571,52 +708,38 @@ def _format_openapi_results(results: list[dict[str, Any]], tag: str) -> str:
571
 
572
 
573
  async def search_openapi_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
574
- """Search HuggingFace OpenAPI specification by tag."""
575
- tag = arguments.get("tag", "")
576
- if not tag:
577
- return "Error: No tag provided", False
 
 
578
 
579
  try:
580
- spec = await _fetch_openapi_spec()
581
- paths = spec.get("paths", {})
582
- servers = spec.get("servers", [])
583
- base_url = (
584
- servers[0].get("url", "https://huggingface.co")
585
- if servers
586
- else "https://huggingface.co"
587
- )
588
 
589
- results = []
590
- for path, path_item in paths.items():
591
- for method, op in path_item.items():
592
- if method not in [
593
- "get",
594
- "post",
595
- "put",
596
- "delete",
597
- "patch",
598
- "head",
599
- "options",
600
- ]:
601
- continue
602
- if tag not in op.get("tags", []):
603
- continue
604
-
605
- results.append(
606
- {
607
- "path": path,
608
- "method": method.upper(),
609
- "operationId": op.get("operationId", ""),
610
- "summary": op.get("summary", ""),
611
- "description": op.get("description", ""),
612
- "parameters": op.get("parameters", []),
613
- "request_body": op.get("requestBody", {}),
614
- "responses": op.get("responses", {}),
615
- "base_url": base_url,
616
- }
617
- )
618
-
619
- return _format_openapi_results(results, tag), True
620
 
621
  except httpx.HTTPStatusError as e:
622
  return f"HTTP error fetching OpenAPI spec: {e.response.status_code}", False
@@ -632,25 +755,42 @@ async def _get_api_search_tool_spec() -> dict[str, Any]:
632
  tags = _extract_all_tags(spec)
633
 
634
  return {
635
- "name": "search_hf_api_endpoints",
636
  "description": (
637
- "Search HuggingFace OpenAPI specification by tag to find API endpoints with curl examples. "
638
- "**Use when:** (1) Need to interact with HF Hub API directly, (2) Building scripts for repo operations, "
639
- "(3) Need authentication patterns, (4) Understanding API parameters and responses, "
640
- "(5) Need curl examples for HTTP requests. "
641
- "Returns: Endpoint paths, methods, parameters, curl examples with authentication, and response schemas. "
642
- "Tags group related operations: repos, models, datasets, inference, spaces, etc."
 
 
 
 
 
 
643
  ),
644
  "parameters": {
645
  "type": "object",
646
  "properties": {
 
 
 
 
 
 
 
 
647
  "tag": {
648
  "type": "string",
649
  "enum": tags,
650
- "description": "The API tag to search for. Each tag groups related API endpoints.",
 
 
 
651
  },
652
  },
653
- "required": ["tag"],
654
  },
655
  }
656
 
 
57
  _index_cache: dict[str, tuple[Any, MultifieldParser]] = {}
58
  _cache_lock = asyncio.Lock()
59
  _openapi_cache: dict[str, Any] | None = None
60
+ _openapi_index_cache: tuple[Any, MultifieldParser, list[dict[str, Any]]] | None = None
61
 
62
  # ---------------------------------------------------------------------------
63
  # Gradio Documentation
 
442
  return sorted(tags)
443
 
444
 
445
+ def _extract_all_endpoints(spec: dict[str, Any]) -> list[dict[str, Any]]:
446
+ """Extract all endpoints from OpenAPI spec."""
447
+ servers = spec.get("servers", [])
448
+ base_url = (
449
+ servers[0].get("url", "https://huggingface.co")
450
+ if servers
451
+ else "https://huggingface.co"
452
+ )
453
+
454
+ endpoints = []
455
+ for path, path_item in spec.get("paths", {}).items():
456
+ for method, op in path_item.items():
457
+ if method not in ["get", "post", "put", "delete", "patch", "head", "options"]:
458
+ continue
459
+ endpoints.append({
460
+ "path": path,
461
+ "method": method.upper(),
462
+ "operationId": op.get("operationId", ""),
463
+ "summary": op.get("summary", ""),
464
+ "description": op.get("description", ""),
465
+ "tags": " ".join(op.get("tags", [])),
466
+ "parameters": op.get("parameters", []),
467
+ "request_body": op.get("requestBody", {}),
468
+ "responses": op.get("responses", {}),
469
+ "base_url": base_url,
470
+ })
471
+ return endpoints
472
+
473
+
474
async def _build_openapi_index() -> tuple[Any, MultifieldParser, list[dict[str, Any]]]:
    """Build, or return the cached, in-memory Whoosh index of OpenAPI endpoints.

    Returns:
        A ``(index, parser, endpoints)`` tuple: the Whoosh index held in
        ``RamStorage``, a ``MultifieldParser`` over the searchable fields, and
        the endpoint dicts the index was built from.
    """
    global _openapi_index_cache
    async with _cache_lock:
        if _openapi_index_cache is not None:
            return _openapi_index_cache

    # NOTE(review): the lock is released while building, presumably because
    # _fetch_openapi_spec may take _cache_lock itself and asyncio.Lock is not
    # reentrant -- confirm before widening the critical section.
    spec = await _fetch_openapi_spec()
    endpoints = _extract_all_endpoints(spec)

    analyzer = StemmingAnalyzer()
    schema = Schema(
        # Not declared unique: several HTTP methods can share one path.
        path=ID(stored=True),
        method=ID(stored=True),
        operationId=TEXT(stored=True, analyzer=analyzer),
        summary=TEXT(stored=True, analyzer=analyzer),
        description=TEXT(stored=True, analyzer=analyzer),
        tags=TEXT(stored=True, analyzer=analyzer),
        param_names=TEXT(stored=False, analyzer=analyzer),
    )
    storage = RamStorage()
    index = storage.create_index(schema)
    writer = index.writer()

    for ep in endpoints:
        # Skip entries without a usable name (e.g. `$ref`-only parameters) so
        # the join never sees None.
        param_names = " ".join(
            str(p["name"])
            for p in ep.get("parameters") or []
            if isinstance(p, dict) and p.get("name")
        )
        writer.add_document(
            path=ep["path"],
            method=ep["method"],
            operationId=ep.get("operationId") or "",
            summary=ep.get("summary") or "",
            description=ep.get("description") or "",
            tags=ep.get("tags") or "",
            param_names=param_names,
        )
    writer.commit()

    parser = MultifieldParser(
        ["summary", "description", "operationId", "tags", "param_names"],
        schema=schema,
        fieldboosts={"summary": 3.0, "operationId": 2.0, "description": 1.0, "tags": 1.5},
        group=OrGroup,
    )

    async with _cache_lock:
        # Another coroutine may have finished building while we were outside
        # the lock; keep the first result so every caller shares one
        # consistent (index, parser, endpoints) triple.
        if _openapi_index_cache is None:
            _openapi_index_cache = (index, parser, endpoints)
        return _openapi_index_cache
521
+
522
+
523
async def _search_openapi(
    query: str, tag: str | None, limit: int = 20
) -> tuple[list[dict[str, Any]], str | None]:
    """Run a keyword search over the indexed OpenAPI endpoints.

    Args:
        query: Free-text query, parsed by the index's ``MultifieldParser``.
        tag: Optional tag; when given, only endpoints carrying that tag are
            returned.
        limit: Maximum number of endpoints to return.

    Returns:
        ``(results, message)``. ``results`` holds endpoint dicts with an added
        ``score`` key (rounded to two decimals), in ranking order. ``message``
        is ``None`` when there are results, otherwise a short explanation.
    """
    index, parser, endpoints = await _build_openapi_index()

    try:
        query_obj = parser.parse(query)
    except Exception:
        # Best effort: any parse failure is reported as a bad query rather
        # than propagated to the tool caller.
        return [], "Query contained unsupported syntax."

    # One lookup table instead of a linear scan per hit; setdefault keeps the
    # first endpoint for a (path, method) pair.
    by_key: dict[tuple[str, str], dict[str, Any]] = {}
    for ep in endpoints:
        by_key.setdefault((ep["path"], ep["method"]), ep)

    # Tags are stored space-joined; padding both sides gives whole-tag
    # matching, so "space" does not match an endpoint tagged "spaces".
    padded_tag = f" {tag} " if tag else None

    matches: list[dict[str, Any]] = []
    with index.searcher() as searcher:
        # When filtering by tag, fetch every hit: a fixed over-fetch could
        # drop valid in-tag results ranked below the cut-off.
        hits = searcher.search(query_obj, limit=None if tag else limit)
        for hit in hits:
            ep = by_key.get((hit["path"], hit["method"]))
            if ep is None:
                continue
            if padded_tag is not None and padded_tag not in f" {ep.get('tags', '')} ":
                continue
            matches.append({**ep, "score": round(hit.score, 2)})
            if len(matches) >= limit:
                break

    message = None if matches else "No matches found for query."
    return matches, message
550
+
551
+
552
  def _generate_curl_example(endpoint: dict[str, Any]) -> str:
553
  """Generate curl command example for an endpoint."""
554
  method = endpoint["method"]
 
643
  return "\n".join(output)
644
 
645
 
646
+ def _format_openapi_results(
647
+ results: list[dict[str, Any]],
648
+ tag: str | None = None,
649
+ query: str | None = None,
650
+ note: str | None = None,
651
+ ) -> str:
652
  """Format OpenAPI search results with curl examples."""
653
  if not results:
654
+ if query and tag:
655
+ return f"No API endpoints found matching '{query}' in tag '{tag}'"
656
+ elif query:
657
+ return f"No API endpoints found matching '{query}'"
658
+ elif tag:
659
+ return f"No API endpoints found with tag '{tag}'"
660
+ return "No API endpoints found"
661
+
662
+ # Build header
663
+ if query and tag:
664
+ out = f"# API Endpoints matching '{query}' (tag: `{tag}`)\n\n"
665
+ elif query:
666
+ out = f"# API Endpoints matching '{query}'\n\n"
667
+ elif tag:
668
+ out = f"# API Endpoints for tag: `{tag}`\n\n"
669
+ else:
670
+ out = "# API Endpoints\n\n"
671
 
672
+ out += f"Found {len(results)} endpoint(s)"
673
+ if note:
674
+ out += f" ({note})"
675
+ out += "\n\n---\n\n"
676
 
677
  for i, ep in enumerate(results, 1):
678
  out += f"## {i}. {ep['method']} {ep['path']}\n\n"
679
 
680
+ if query and "score" in ep:
681
+ out += f"**Relevance:** {ep['score']:.2f}\n\n"
682
+
683
+ if ep.get("summary"):
684
  out += f"**Summary:** {ep['summary']}\n\n"
685
 
686
+ if ep.get("description"):
687
  desc = ep["description"][:300]
688
  if len(ep["description"]) > 300:
689
  desc += "..."
690
  out += f"**Description:** {desc}\n\n"
691
 
692
+ if ep.get("tags"):
693
+ out += f"**Tags:** {ep['tags']}\n\n"
694
+
695
  params_info = _format_parameters(ep.get("parameters", []))
696
  if params_info:
697
  out += params_info + "\n\n"
 
708
 
709
 
710
  async def search_openapi_handler(arguments: dict[str, Any]) -> tuple[str, bool]:
711
+ """Search HuggingFace OpenAPI specification by query and/or tag."""
712
+ tag = arguments.get("tag", "").strip() or None
713
+ query = arguments.get("query", "").strip() or None
714
+
715
+ if not tag and not query:
716
+ return "Error: Provide either 'query' (keyword search) or 'tag' (category filter), or both.", False
717
 
718
  try:
719
+ note = None
 
 
 
 
 
 
 
720
 
721
+ # If query provided, try Whoosh search first
722
+ if query:
723
+ results, search_note = await _search_openapi(query, tag, limit=20)
724
+
725
+ # If Whoosh found results, return them
726
+ if results:
727
+ return _format_openapi_results(results, tag=tag, query=query, note=search_note), True
728
+
729
+ # Whoosh found nothing - fall back to tag-based if tag provided
730
+ if tag:
731
+ note = f"No matches for '{query}'; showing all endpoints in tag '{tag}'"
732
+ else:
733
+ # No tag to fall back to
734
+ return _format_openapi_results([], query=query), True
735
+
736
+ # Tag-based search (either as fallback or primary)
737
+ if tag:
738
+ _, _, endpoints = await _build_openapi_index()
739
+ results = [ep for ep in endpoints if tag in ep.get("tags", "")]
740
+ return _format_openapi_results(results, tag=tag, query=None, note=note), True
741
+
742
+ return "Error: No results found", False
 
 
 
 
 
 
 
 
 
743
 
744
  except httpx.HTTPStatusError as e:
745
  return f"HTTP error fetching OpenAPI spec: {e.response.status_code}", False
 
755
  tags = _extract_all_tags(spec)
756
 
757
  return {
758
+ "name": "find_hf_api",
759
  "description": (
760
+ "Find HuggingFace Hub REST API endpoints to make HTTP requests. Returns curl examples with authentication. "
761
+ "⚠️ USE THIS TOOL when you need to call the HF Hub API directly - for operations like: "
762
+ "uploading/downloading files, managing repos, listing models/datasets, getting user info, "
763
+ "managing webhooks, collections, discussions, or any Hub interaction not covered by other tools. "
764
+ "**Use cases:** (1) 'Stream Space logs' query='space logs', "
765
+ "(2) 'Get Space metrics/Zero-GPU usage' query='space metrics', "
766
+ "(3) 'List organization members' → query='organization members', "
767
+ "(4) 'Generate repo access token' → query='jwt token', "
768
+ "(5) 'Check repo security scan' → query='security scan'. "
769
+ "**Search modes:** Use 'query' for keyword search, 'tag' to browse a category, or both. "
770
+ "If query finds no results, falls back to showing all endpoints in the tag. "
771
+ "**Output:** Full endpoint details with method, path, parameters, curl command, and response schema."
772
  ),
773
  "parameters": {
774
  "type": "object",
775
  "properties": {
776
+ "query": {
777
+ "type": "string",
778
+ "description": (
779
+ "Keyword search across endpoint summaries, descriptions, and operation IDs. "
780
+ "Examples: 'upload file', 'create repository', 'list user models', 'delete branch', "
781
+ "'webhook', 'collection', 'discussion comments'. Supports stemming (upload/uploading both work)."
782
+ ),
783
+ },
784
  "tag": {
785
  "type": "string",
786
  "enum": tags,
787
+ "description": (
788
+ "Filter by API category. Use alone to browse all endpoints in a category, "
789
+ "or combine with 'query' to search within a category."
790
+ ),
791
  },
792
  },
793
+ "required": [],
794
  },
795
  }
796