"""Example: Documentation Coverage Score

Analyzes a GitHub repository and produces a Documentation Coverage Score (0–100)
plus the top 5 most important undocumented functions/classes.
"""

import base64
import os

import requests
from pydantic import BaseModel, Field

from acorn import Module, tool


class RepoInput(BaseModel):
    repo_url: str = Field(description="GitHub repository URL, e.g. https://github.com/owner/repo")


class DocCoverageReport(BaseModel):
    score: int | None = Field(description="Documentation coverage score from 0 to 100")
    summary: str = Field(
        description=(
            "2-3 sentence narrative covering project type, overall documentation "
            "state, and the most impactful improvement to make"
        )
    )
    advice: str | None = Field(
        description=(
            "Top 5 most important improvements that can be made to the repo to increase the score"
        )
    )


SYSTEM_PROMPT = """You are a documentation analyst evaluating the documentation coverage of a GitHub repository.

Workflow:
1. Parse owner/repo from the repo URL (handle trailing slashes and .git suffixes)
2. Call get_file_tree to get a full list of all files in the repo
3. Identify the project type (Python library, JS framework, CLI tool, etc.) from
   file extensions and config files (pyproject.toml, package.json, Cargo.toml, go.mod, etc.)
4. Check for standard documentation files by scanning the tree:
   - README (any casing, .md/.rst/.txt)
   - LICENSE / COPYING
   - CONTRIBUTING.md
   - CHANGELOG / HISTORY
   - docs/ folder or doc config (mkdocs.yml, docs/conf.py, .readthedocs.yaml)
5. Assess project complexity: count source files by language, identify core modules
   (look for src/, lib/, the package directory matching the project name)
6. Read key source files; prioritize:
   - The main entry point (e.g. __init__.py, index.ts, main.rs, lib.rs)
   - Public API modules (files that appear to export symbols)
   - The largest/most central source files
   Read as many as needed to get a representative sample (aim for 5-10 files).
7. In each file, identify public functions, classes, and methods. Check whether
   they have docstrings (Python), JSDoc comments (JS/TS), or doc comments (Rust/Go).
   Count documented vs undocumented public symbols.
8. Calculate a Documentation Coverage Score (0–100):
   - Standard docs (up to 30 points):
     README present: 15 pts | LICENSE: 5 pts | CONTRIBUTING: 5 pts | docs/ folder or CHANGELOG: 5 pts
   - Inline documentation rate (up to 70 points):
     (documented public symbols / total public symbols) * 70
   Round to nearest integer.
9. Identify the top 5 most important undocumented symbols; prioritize:
   - Public-facing API functions / classes used by external consumers
   - __init__ / constructor methods of core classes
   - Functions with many parameters or complex signatures
   - Entry points and top-level exports
10. Return the structured report.
"""


class DocCoverageAnalyzer(Module):
    """Scores the documentation coverage of a GitHub repository."""

    system_prompt = SYSTEM_PROMPT
    model = "anthropic/claude-sonnet-4-6"
    temperature = 0.3
    max_steps = 20

    initial_input = RepoInput
    final_output = DocCoverageReport

    def __init__(self, github_token: str | None = None):
        super().__init__()
        # An explicit token wins; otherwise fall back to the GITHUB_TOKEN env var.
        # Unauthenticated requests hit GitHub's much lower rate limit.
        self.github_token = github_token or os.environ.get("GITHUB_TOKEN")

    def _github_headers(self) -> dict:
        """Build GitHub API request headers, adding auth when a token is available."""
        headers = {"Accept": "application/vnd.github+json"}
        if self.github_token:
            headers["Authorization"] = f"Bearer {self.github_token}"
        return headers

    @tool
    def get_file_tree(self, repo_path: str) -> list[str]:
        """Get a flat list of all file paths in a GitHub repository.

        Args:
            repo_path: Repository in 'owner/repo' format (e.g. 'pallets/flask')

        Returns:
            Flat list of all file paths in the repository (blobs only), truncated to 2000 entries
        """
        url = f"https://api.github.com/repos/{repo_path}/git/trees/HEAD?recursive=1"
        r = requests.get(url, headers=self._github_headers(), timeout=15)
        r.raise_for_status()
        tree = r.json().get("tree", [])
        # Keep only blobs (files); "tree" entries are directories.
        paths = [entry["path"] for entry in tree if entry.get("type") == "blob"]
        return paths[:2000]

    @tool
    def read_file(self, repo_path: str, file_path: str) -> str:
        """Read the contents of a file in a GitHub repository.

        Args:
            repo_path: Repository in 'owner/repo' format (e.g. 'pallets/flask')
            file_path: Path to the file within the repo (e.g. 'README.md' or 'src/main.py')

        Returns:
            Decoded text content of the file
        """
        url = f"https://api.github.com/repos/{repo_path}/contents/{file_path.strip('/')}"
        r = requests.get(url, headers=self._github_headers(), timeout=15)
        r.raise_for_status()
        data = r.json()
        return base64.b64decode(data["content"]).decode("utf-8", errors="replace")
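

# Minimal usage sketch. `run` is an assumed entry point on acorn's Module
# (the real method name may differ); treat this block as illustrative only.
if __name__ == "__main__":
    analyzer = DocCoverageAnalyzer()
    report = analyzer.run(RepoInput(repo_url="https://github.com/pallets/flask"))
    print(report.model_dump_json(indent=2))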