deepmage121 committed
Commit 92ea780 · 1 Parent(s): 82484ad

initial commit, space validation stuff

.gitignore ADDED
@@ -0,0 +1,18 @@
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ *.egg-info/
+ dist/
+ build/
+ .eggs/
+ *.egg
+ .venv/
+ venv/
+ env/
+ .env
+ *.log
+ .mypy_cache/
+ .pytest_cache/
+ .ruff_cache/
+ /tmp_data/
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
Dockerfile ADDED
@@ -0,0 +1,16 @@
+ FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
+
+ WORKDIR /app
+
+ ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy
+
+ COPY pyproject.toml uv.lock ./
+ RUN uv sync --locked --no-install-project
+
+ COPY . /app
+ RUN uv sync --locked
+
+ ENV PATH="/app/.venv/bin:$PATH"
+
+ ENTRYPOINT []
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -4,6 +4,7 @@ emoji: 👀
  colorFrom: red
  colorTo: purple
  sdk: docker
+ app_port: 7860
  pinned: false
  ---
  
app.py ADDED
@@ -0,0 +1,336 @@
1
+ """EEE Validator — HuggingFace Space webhook handler.
2
+
3
+ Listens for PR events on evaleval/EEE_datastore, validates changed data
4
+ files with Pydantic, checks for duplicates, and comments results on the PR.
5
+ """
6
+
7
+ import logging
8
+ import os
9
+ import tempfile
10
+ import threading
11
+ from datetime import datetime, timezone
12
+
13
+ from huggingface_hub import HfApi, WebhooksServer, webhook_endpoint
14
+ from huggingface_hub import hf_hub_download
15
+ from huggingface_hub.utils import EntryNotFoundError
16
+
17
+ from dedup import DATASET_REPO_ID, DedupReport, check_duplicates, load_manifest
18
+ from validate_data import FileValidationResult, validate_with_pydantic
19
+
20
+ logging.basicConfig(
21
+ level=logging.INFO,
22
+ format="%(asctime)s %(levelname)s %(name)s: %(message)s",
23
+ )
24
+ logger = logging.getLogger(__name__)
25
+
26
+ api = HfApi()
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Changed file discovery
31
+ # ---------------------------------------------------------------------------
32
+
33
+ def find_changed_files(pr_num: int) -> list[str]:
34
+ """Find added/modified .json and .jsonl files by comparing PR tree to main.
35
+
36
+ Falls back to tree comparison since DiscussionWithDetails.diff can be None
37
+ for dataset repos.
38
+ """
39
+ revision = f"refs/pr/{pr_num}"
40
+
41
+ def _list_files(rev: str) -> dict[str, str]:
42
+ """Return {path: oid} for all files at a given revision."""
43
+ files = {}
44
+ for entry in api.list_repo_tree(
45
+ repo_id=DATASET_REPO_ID,
46
+ repo_type="dataset",
47
+ revision=rev,
48
+ recursive=True,
49
+ ):
50
+ if hasattr(entry, "rfilename"):
51
+ files[entry.rfilename] = getattr(entry, "oid", None)
52
+ return files
53
+
54
+ pr_files = _list_files(revision)
55
+ main_files = _list_files("main")
56
+
57
+ changed: list[str] = []
58
+ for path, oid in pr_files.items():
59
+ if not path.startswith("data/"):
60
+ continue
61
+ if not (path.endswith(".json") or path.endswith(".jsonl")):
62
+ continue
63
+ # New file, or existing file with different content
64
+ if path not in main_files or main_files[path] != oid:
65
+ changed.append(path)
66
+
67
+ return changed
68
+
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # File download
72
+ # ---------------------------------------------------------------------------
73
+
74
+ def download_pr_files(
75
+ file_paths: list[str], pr_num: int, tmp_dir: str
76
+ ) -> dict[str, str]:
77
+ """Download files from a PR branch and return map of repo-path -> local-path."""
78
+ downloaded: dict[str, str] = {}
79
+ revision = f"refs/pr/{pr_num}"
80
+
81
+ for file_path in file_paths:
82
+ try:
83
+ local_path = hf_hub_download(
84
+ repo_id=DATASET_REPO_ID,
85
+ filename=file_path,
86
+ repo_type="dataset",
87
+ revision=revision,
88
+ local_dir=tmp_dir,
89
+ )
90
+ downloaded[file_path] = local_path
91
+ logger.info("Downloaded %s -> %s", file_path, local_path)
92
+ except EntryNotFoundError:
93
+ logger.warning("File not found in PR: %s", file_path)
94
+ except Exception:
95
+ logger.exception("Failed to download %s", file_path)
96
+
97
+ return downloaded
98
+
99
+
100
+ # ---------------------------------------------------------------------------
101
+ # Validation orchestration
102
+ # ---------------------------------------------------------------------------
103
+
104
+ def validate_files(
105
+ downloaded: dict[str, str],
106
+ ) -> list[FileValidationResult]:
107
+ """Validate all downloaded files and return results."""
108
+ results: list[FileValidationResult] = []
109
+
110
+ for repo_path, local_path in downloaded.items():
111
+ if repo_path.endswith(".jsonl"):
112
+ file_type = "jsonl"
113
+ else:
114
+ file_type = "json"
115
+
116
+ result = validate_with_pydantic(local_path, file_type)
117
+ # Store the repo-relative path for reporting
118
+ result.file_path = repo_path
119
+ results.append(result)
120
+
121
+ return results
122
+
123
+
124
+ # ---------------------------------------------------------------------------
125
+ # Deduplication orchestration
126
+ # ---------------------------------------------------------------------------
127
+
128
+ def run_dedup(
129
+ file_paths: list[str], downloaded: dict[str, str]
130
+ ) -> DedupReport:
131
+ """Load manifest and check all files for duplicates."""
132
+ manifest = load_manifest(api)
133
+
134
+ # Read file contents as bytes
135
+ file_contents: dict[str, bytes] = {}
136
+ for repo_path, local_path in downloaded.items():
137
+ with open(local_path, "rb") as f:
138
+ file_contents[repo_path] = f.read()
139
+
140
+ return check_duplicates(file_paths, file_contents, manifest)
141
+
142
+
143
+ # ---------------------------------------------------------------------------
144
+ # Comment formatting
145
+ # ---------------------------------------------------------------------------
146
+
147
+ def format_comment(
148
+ pr_num: int,
149
+ validation_results: list[FileValidationResult],
150
+ dedup_report: DedupReport,
151
+ ) -> str:
152
+ """Format the PR comment as markdown."""
153
+ now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
154
+ num_passed = sum(1 for r in validation_results if r.valid)
155
+ num_failed = sum(1 for r in validation_results if not r.valid)
156
+ total = len(validation_results)
157
+
158
+ lines = [
159
+ "## EEE Validation Report",
160
+ f"**PR:** #{pr_num} | **Run:** {now}",
161
+ "",
162
+ "### Validation Results",
163
+ "| File | Status | Details |",
164
+ "|------|--------|---------|",
165
+ ]
166
+
167
+ for r in validation_results:
168
+ if r.valid:
169
+ type_label = "EvaluationLog" if r.file_type == "json" else "InstanceLevelEvaluationLog"
170
+ lines.append(f"| `{r.file_path}` | PASS | Validated as {type_label} |")
171
+ else:
172
+ # Show first few errors to avoid extremely long comments
173
+ error_summary = "; ".join(r.errors[:5])
174
+ if len(r.errors) > 5:
175
+ error_summary += f" ... and {len(r.errors) - 5} more error(s)"
176
+ lines.append(f"| `{r.file_path}` | FAIL | {error_summary} |")
177
+
178
+ # Dedup section
179
+ lines.append("")
180
+ lines.append("### Duplicate Check")
181
+
182
+ has_any_dupes = False
183
+ for dr in dedup_report.results:
184
+ if dr.exact_duplicate_of:
185
+ lines.append(
186
+ f"- **Exact duplicate:** `{dr.file_path}` is identical to "
187
+ f"existing `{dr.exact_duplicate_of}`"
188
+ )
189
+ has_any_dupes = True
190
+ if dr.near_duplicate_of:
191
+ lines.append(
192
+ f"- **Potential near-duplicate:** `{dr.file_path}` shares fingerprint "
193
+ f"with existing `{dr.near_duplicate_of}` "
194
+ f"(identical content minus timestamps/UUIDs)"
195
+ )
196
+ has_any_dupes = True
197
+
198
+ if not has_any_dupes:
199
+ lines.append("- No exact or near duplicates found.")
200
+
201
+ # Summary
202
+ lines.append("")
203
+ lines.append("### Summary")
204
+ lines.append(f"{total} file(s) checked: {num_passed} passed, {num_failed} failed")
205
+
206
+ return "\n".join(lines)
207
+
208
+
209
+ # ---------------------------------------------------------------------------
210
+ # Core validation logic (shared by webhook + startup sweep)
211
+ # ---------------------------------------------------------------------------
212
+
213
+ REPORT_HEADER = "## EEE Validation Report"
214
+
215
+
216
+ def process_pr(pr_num: int) -> dict:
217
+ """Run full validation + dedup on a PR and post results as a comment."""
218
+ logger.info("Processing PR #%d", pr_num)
219
+
220
+ # Find changed data files by comparing PR tree to main
221
+ changed_files = find_changed_files(pr_num)
222
+ if not changed_files:
223
+ logger.info("No data files changed in PR #%d", pr_num)
224
+ return {"status": "skipped", "reason": "no data files changed"}
225
+
226
+ logger.info("Found %d changed data file(s): %s", len(changed_files), changed_files)
227
+
228
+ # Create temp directory for downloads
229
+ tmp_dir = tempfile.mkdtemp(prefix=f"eee_validate_{pr_num}_")
230
+
231
+ # Download changed files from the PR branch
232
+ downloaded = download_pr_files(changed_files, pr_num, tmp_dir)
233
+ if not downloaded:
234
+ logger.warning("No files could be downloaded for PR #%d", pr_num)
235
+ return {"status": "error", "reason": "no files downloaded"}
236
+
237
+ # Validate files
238
+ validation_results = validate_files(downloaded)
239
+
240
+ # Run dedup check
241
+ dedup_report = run_dedup(changed_files, downloaded)
242
+
243
+ # Format and post comment
244
+ comment = format_comment(pr_num, validation_results, dedup_report)
245
+ logger.info("Posting validation comment on PR #%d", pr_num)
246
+
247
+ api.comment_discussion(
248
+ repo_id=DATASET_REPO_ID,
249
+ discussion_num=pr_num,
250
+ comment=comment,
251
+ repo_type="dataset",
252
+ )
253
+
254
+ return {
255
+ "status": "ok",
256
+ "pr": pr_num,
257
+ "files_checked": len(validation_results),
258
+ "passed": sum(1 for r in validation_results if r.valid),
259
+ "failed": sum(1 for r in validation_results if not r.valid),
260
+ }
261
+
262
+
263
+ # ---------------------------------------------------------------------------
264
+ # Startup sweep — catch PRs missed while the Space was asleep
265
+ # ---------------------------------------------------------------------------
266
+
267
+ def pr_has_validation_comment(pr_num: int) -> bool:
268
+ """Check if a PR already has an EEE Validation Report comment."""
269
+ details = api.get_discussion_details(
270
+ repo_id=DATASET_REPO_ID,
271
+ discussion_num=pr_num,
272
+ repo_type="dataset",
273
+ )
274
+ for event in details.events:
275
+ if event.type == "comment" and event.content and event.content.startswith(REPORT_HEADER):
276
+ return True
277
+ return False
278
+
279
+
280
+ def startup_sweep() -> None:
281
+ """Scan open PRs and validate any that are missing a report."""
282
+ logger.info("Running startup sweep for unvalidated PRs...")
283
+ try:
284
+ discussions = api.get_repo_discussions(
285
+ repo_id=DATASET_REPO_ID,
286
+ repo_type="dataset",
287
+ )
288
+ for disc in discussions:
289
+ if not disc.is_pull_request or disc.status != "open":
290
+ continue
291
+ if pr_has_validation_comment(disc.num):
292
+ logger.info("PR #%d already has a validation report, skipping", disc.num)
293
+ continue
294
+ logger.info("PR #%d has no validation report, processing", disc.num)
295
+ try:
296
+ process_pr(disc.num)
297
+ except Exception:
298
+ logger.exception("Startup sweep failed for PR #%d", disc.num)
299
+ except Exception:
300
+ logger.exception("Startup sweep failed to list discussions")
301
+ logger.info("Startup sweep complete")
302
+
303
+
304
+ # Run sweep in background thread so it doesn't block the webhook server startup
305
+ threading.Thread(target=startup_sweep, daemon=True).start()
306
+
307
+
308
+ # ---------------------------------------------------------------------------
309
+ # Webhook endpoint
310
+ # ---------------------------------------------------------------------------
311
+
312
+ @webhook_endpoint
313
+ async def validate(payload):
314
+ """Handle incoming webhook events from HuggingFace."""
315
+ logger.info("Received webhook event: %s", payload.event)
316
+
317
+ # Filter: only dataset PRs, ignore comments
318
+ if payload.event.scope == "discussion.comment":
319
+ logger.info("Skipping comment event")
320
+ return {"status": "skipped", "reason": "comment event"}
321
+
322
+ if payload.repo.type != "dataset":
323
+ logger.info("Skipping non-dataset event (type=%s)", payload.repo.type)
324
+ return {"status": "skipped", "reason": "not a dataset repo"}
325
+
326
+ if not payload.discussion or not payload.discussion.isPullRequest:
327
+ logger.info("Skipping non-PR event")
328
+ return {"status": "skipped", "reason": "not a pull request"}
329
+
330
+ pr_num = payload.discussion.num
331
+
332
+ try:
333
+ return process_pr(pr_num)
334
+ except Exception:
335
+ logger.exception("Failed to process PR #%d", pr_num)
336
+ return {"status": "error", "reason": "processing failed"}
dedup.py ADDED
@@ -0,0 +1,164 @@
1
+ """Deduplication module for EEE validation pipeline.
2
+
3
+ Two-level dedup:
4
+ - Exact duplicates: SHA256 hash of entire file content
5
+ - Near duplicates: SHA256 hash of content minus timestamps/UUIDs
6
+ """
7
+
8
+ import hashlib
9
+ import json
10
+ import logging
11
+ from dataclasses import dataclass, field
12
+ from typing import Any
13
+
14
+ from huggingface_hub import HfApi, hf_hub_download
15
+ from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ DATASET_REPO_ID = "evaleval/EEE_datastore"
20
+ MANIFEST_PATH = "manifest.json"
21
+
22
+ # Fields to strip for near-duplicate fingerprinting
23
+ FINGERPRINT_STRIP_FIELDS = {
24
+ "retrieved_timestamp",
25
+ "evaluation_id",
26
+ "evaluation_timestamp",
27
+ }
28
+
29
+
30
+ def compute_sha256(content: bytes) -> str:
31
+ return hashlib.sha256(content).hexdigest()
32
+
33
+
34
+ def _strip_fields(data: dict[str, Any], fields_to_strip: set[str]) -> dict[str, Any]:
35
+ """Recursively strip specified fields from a dict for fingerprinting."""
36
+ result = {}
37
+ for key, value in data.items():
38
+ if key in fields_to_strip:
39
+ continue
40
+ if isinstance(value, dict):
41
+ result[key] = _strip_fields(value, fields_to_strip)
42
+ elif isinstance(value, list):
43
+ result[key] = [
44
+ _strip_fields(item, fields_to_strip) if isinstance(item, dict) else item
45
+ for item in value
46
+ ]
47
+ else:
48
+ result[key] = value
49
+ return result
50
+
51
+
52
+ def compute_fingerprint(content: bytes) -> str:
53
+ """Compute a near-duplicate fingerprint by hashing content minus timestamps/UUIDs."""
54
+ try:
55
+ data = json.loads(content)
56
+ except (json.JSONDecodeError, UnicodeDecodeError):
57
+ # If we can't parse as JSON, fall back to full content hash
58
+ return compute_sha256(content)
59
+
60
+ stripped = _strip_fields(data, FINGERPRINT_STRIP_FIELDS)
61
+ # Serialize deterministically
62
+ canonical = json.dumps(stripped, sort_keys=True, ensure_ascii=True).encode()
63
+ return hashlib.sha256(canonical).hexdigest()
64
+
65
+
66
+ @dataclass
67
+ class DedupResult:
68
+ """Results of deduplication check for a single file."""
69
+ file_path: str
70
+ sha256: str
71
+ fingerprint: str
72
+ exact_duplicate_of: str | None = None
73
+ near_duplicate_of: str | None = None
74
+
75
+
76
+ @dataclass
77
+ class DedupReport:
78
+ """Aggregated dedup report across all checked files."""
79
+ results: list[DedupResult] = field(default_factory=list)
80
+
81
+ @property
82
+ def has_exact_duplicates(self) -> bool:
83
+ return any(r.exact_duplicate_of is not None for r in self.results)
84
+
85
+ @property
86
+ def has_near_duplicates(self) -> bool:
87
+ return any(r.near_duplicate_of is not None for r in self.results)
88
+
89
+
90
+ def load_manifest(api: HfApi) -> dict[str, Any]:
91
+ """Download and parse manifest.json from the dataset repo's main branch."""
92
+ try:
93
+ manifest_file = hf_hub_download(
94
+ repo_id=DATASET_REPO_ID,
95
+ filename=MANIFEST_PATH,
96
+ repo_type="dataset",
97
+ revision="main",
98
+ )
99
+ with open(manifest_file, "r") as f:
100
+ return json.load(f)
101
+ except (EntryNotFoundError, RepositoryNotFoundError):
102
+ logger.warning("manifest.json not found in %s, using empty manifest", DATASET_REPO_ID)
103
+ return {"files": {}}
104
+ except Exception:
105
+ logger.exception("Failed to load manifest.json")
106
+ return {"files": {}}
107
+
108
+
109
+ def check_duplicates(
110
+ file_paths: list[str],
111
+ file_contents: dict[str, bytes],
112
+ manifest: dict[str, Any],
113
+ ) -> DedupReport:
114
+ """Check files against the manifest for exact and near duplicates.
115
+
116
+ Args:
117
+ file_paths: Repo-relative paths of files to check (e.g. "data/gsm8k/.../abc.json")
118
+ file_contents: Map of repo-relative path -> raw file bytes
119
+ manifest: Parsed manifest.json with "files" key
120
+ """
121
+ report = DedupReport()
122
+ manifest_files = manifest.get("files", {})
123
+
124
+ # Build reverse lookups from manifest
125
+ sha256_to_path: dict[str, str] = {}
126
+ fingerprint_to_path: dict[str, str] = {}
127
+ for path, entry in manifest_files.items():
128
+ sha256_to_path[entry["sha256"]] = path
129
+ fingerprint_to_path[entry["fingerprint"]] = path
130
+
131
+ for file_path in file_paths:
132
+ content = file_contents.get(file_path)
133
+ if content is None:
134
+ continue
135
+
136
+ sha256 = compute_sha256(content)
137
+
138
+ # Only compute fingerprints for .json files (not .jsonl)
139
+ if file_path.endswith(".json"):
140
+ fingerprint = compute_fingerprint(content)
141
+ else:
142
+ fingerprint = sha256 # For JSONL, fingerprint == sha256
143
+
144
+ result = DedupResult(
145
+ file_path=file_path,
146
+ sha256=sha256,
147
+ fingerprint=fingerprint,
148
+ )
149
+
150
+ # Check exact duplicate
151
+ if sha256 in sha256_to_path and sha256_to_path[sha256] != file_path:
152
+ result.exact_duplicate_of = sha256_to_path[sha256]
153
+
154
+ # Check near duplicate (only if not already an exact duplicate)
155
+ if (
156
+ result.exact_duplicate_of is None
157
+ and fingerprint in fingerprint_to_path
158
+ and fingerprint_to_path[fingerprint] != file_path
159
+ ):
160
+ result.near_duplicate_of = fingerprint_to_path[fingerprint]
161
+
162
+ report.results.append(result)
163
+
164
+ return report
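
To make the two-level dedup concrete, here is a small illustrative check (not part of the commit). The two records are made up; they differ only in the fields that compute_fingerprint strips, so they hash to different SHA256 values but share a fingerprint, which is exactly what check_duplicates reports as a near-duplicate.

```python
# Illustration of dedup.py's two-level check; the records are hypothetical.
import json

from dedup import compute_fingerprint, compute_sha256

record_a = {
    "evaluation_id": "GSM8K/model-x/1700000000",
    "retrieved_timestamp": "1700000000",
    "score": 0.81,
}
# Same payload except for the fields in FINGERPRINT_STRIP_FIELDS
record_b = {
    "evaluation_id": "GSM8K/model-x/1700009999",
    "retrieved_timestamp": "1700009999",
    "score": 0.81,
}

a = json.dumps(record_a).encode()
b = json.dumps(record_b).encode()

assert compute_sha256(a) != compute_sha256(b)            # not exact duplicates
assert compute_fingerprint(a) == compute_fingerprint(b)  # near-duplicates
```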
eval.schema.json ADDED
@@ -0,0 +1,653 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "version": "0.2.0",
4
+ "type": "object",
5
+ "description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics",
6
+ "required": [
7
+ "schema_version",
8
+ "evaluation_id",
9
+ "retrieved_timestamp",
10
+ "source_metadata",
11
+ "model_info",
12
+ "evaluation_results"
13
+ ],
14
+ "additionalProperties": false,
15
+ "properties": {
16
+ "schema_version": {
17
+ "type": "string",
18
+ "description": "Version of the schema used for this evaluation data"
19
+ },
20
+ "evaluation_id": {
21
+ "type": "string",
22
+ "description": "Unique identifier for this specific evaluation run. Use eval_name/model_id/retrieved_timestamp format"
23
+ },
24
+ "evaluation_timestamp": {
25
+ "type": "string",
26
+ "description": "Timestamp for when the evaluation was run"
27
+ },
28
+ "retrieved_timestamp": {
29
+ "type": "string",
30
+ "description": "Timestamp for when this record was created - using Unix Epoch time format"
31
+ },
32
+ "source_metadata": {
33
+ "type": "object",
34
+ "description": "Metadata about the source of the leaderboard data",
35
+ "required": [
36
+ "source_type",
37
+ "source_organization_name",
38
+ "evaluator_relationship"
39
+ ],
40
+ "properties": {
41
+ "source_name": {
42
+ "type": "string",
43
+ "description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation)."
44
+ },
45
+ "source_type": {
46
+ "type": "string",
47
+ "enum": [
48
+ "documentation",
49
+ "evaluation_run"
50
+ ],
51
+ "description": "Whether the data comes from a direct evaluation run or from documentation"
52
+ },
53
+ "source_organization_name": {
54
+ "type": "string",
55
+ "description": "Name of the organization that provides the data"
56
+ },
57
+ "source_organization_url": {
58
+ "type": "string",
59
+ "description": "URL for the organization that provides the data"
60
+ },
61
+ "source_organization_logo_url": {
62
+ "type": "string",
63
+ "description": "URL for the Logo for the organization that provides the data"
64
+ },
65
+ "evaluator_relationship": {
66
+ "type": "string",
67
+ "description": "Relationship between the evaluator and the model",
68
+ "enum": [
69
+ "first_party",
70
+ "third_party",
71
+ "collaborative",
72
+ "other"
73
+ ]
74
+ }
75
+ }
76
+ },
77
+ "model_info": {
78
+ "$ref": "#/$defs/model_info"
79
+ },
80
+ "evaluation_results": {
81
+ "type": "array",
82
+ "description": "Array of evaluation results",
83
+ "items": {
84
+ "type": "object",
85
+ "required": [
86
+ "evaluation_name",
87
+ "source_data",
88
+ "metric_config",
89
+ "score_details"
90
+ ],
91
+ "properties": {
92
+ "evaluation_name": {
93
+ "type": "string",
94
+ "description": "Name of the evaluation"
95
+ },
96
+ "source_data": {
97
+ "description": "Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.",
98
+ "oneOf": [
99
+ {
100
+ "$ref": "#/$defs/source_data_url"
101
+ },
102
+ {
103
+ "$ref": "#/$defs/source_data_hf"
104
+ },
105
+ {
106
+ "$ref": "#/$defs/source_data_private"
107
+ }
108
+ ]
109
+ },
110
+ "evaluation_timestamp": {
111
+ "type": "string",
112
+ "description": "Timestamp for when the evaluations were run"
113
+ },
114
+ "metric_config": {
115
+ "type": "object",
116
+ "description": "Details about the metric",
117
+ "required": [
118
+ "lower_is_better"
119
+ ],
120
+ "properties": {
121
+ "evaluation_description": {
122
+ "type": "string",
123
+ "description": "Description of the evaluation"
124
+ },
125
+ "lower_is_better": {
126
+ "type": "boolean",
127
+ "description": "Whether a lower score is better"
128
+ },
129
+ "score_type": {
130
+ "type": "string",
131
+ "description": "Type of score",
132
+ "enum": [
133
+ "binary",
134
+ "continuous",
135
+ "levels"
136
+ ]
137
+ },
138
+ "level_names": {
139
+ "type": "array",
140
+ "description": "Names of the score levels",
141
+ "items": {
142
+ "type": "string"
143
+ }
144
+ },
145
+ "level_metadata": {
146
+ "type": "array",
147
+ "description": "Additional Description for each Score Level",
148
+ "items": {
149
+ "type": "string"
150
+ }
151
+ },
152
+ "has_unknown_level": {
153
+ "type": "boolean",
154
+ "description": "Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown"
155
+ },
156
+ "min_score": {
157
+ "type": "number",
158
+ "description": "Minimum possible score for continuous metric"
159
+ },
160
+ "max_score": {
161
+ "type": "number",
162
+ "description": "Maximum possible score for continuous metric"
163
+ },
164
+ "llm_scoring": {
165
+ "type": "object",
166
+ "description": "Configuration when LLM is used as scorer/judge",
167
+ "additionalProperties": true,
168
+ "required": [
169
+ "judges",
170
+ "input_prompt"
171
+ ],
172
+ "properties": {
173
+ "judges": {
174
+ "type": "array",
175
+ "description": "LLM judge(s) - single item for judge, multiple for jury",
176
+ "items": {
177
+ "$ref": "#/$defs/judge_config"
178
+ },
179
+ "minItems": 1
180
+ },
181
+ "input_prompt": {
182
+ "type": "string",
183
+ "description": "Prompt template used for judging"
184
+ },
185
+ "aggregation_method": {
186
+ "type": "string",
187
+ "enum": [
188
+ "majority_vote",
189
+ "average",
190
+ "weighted_average",
191
+ "median"
192
+ ],
193
+ "description": "How to aggregate scores when multiple judges"
194
+ },
195
+ "expert_baseline": {
196
+ "type": "number",
197
+ "description": "Expert/human baseline score for comparison"
198
+ },
199
+ "additional_details": {
200
+ "$ref": "#/$defs/additional_properties_object"
201
+ }
202
+ }
203
+ }
204
+ },
205
+ "if": {
206
+ "properties": {
207
+ "score_type": {
208
+ "const": "levels"
209
+ }
210
+ }
211
+ },
212
+ "then": {
213
+ "required": [
214
+ "level_names",
215
+ "has_unknown_level"
216
+ ]
217
+ },
218
+ "else": {
219
+ "if": {
220
+ "properties": {
221
+ "score_type": {
222
+ "const": "continuous"
223
+ }
224
+ }
225
+ },
226
+ "then": {
227
+ "required": [
228
+ "min_score",
229
+ "max_score"
230
+ ]
231
+ }
232
+ }
233
+ },
234
+ "score_details": {
235
+ "type" : "object",
236
+ "description": "The score for the evaluation and related details",
237
+ "required": [
238
+ "score"
239
+ ],
240
+ "properties": {
241
+ "score": {
242
+ "type": "number",
243
+ "description": "The score for the evaluation"
244
+ },
245
+ "details": {
246
+ "$ref": "#/$defs/additional_properties_object"
247
+ },
248
+ "uncertainty": {
249
+ "type": "object",
250
+ "description": "Quantification of uncertainty around the reported score",
251
+ "properties": {
252
+ "standard_error": {
253
+ "type": "object",
254
+ "description": "Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))",
255
+ "properties": {
256
+ "value": {
257
+ "type": "number",
258
+ "description": "The standard error value"
259
+ },
260
+ "method": {
261
+ "type": "string",
262
+ "description": "How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')"
263
+ }
264
+ },
265
+ "required": ["value"]
266
+ },
267
+ "confidence_interval": {
268
+ "type": "object",
269
+ "description": "Lower and upper bounds for the metric at a given confidence level.",
270
+ "properties": {
271
+ "lower": {
272
+ "type": "number",
273
+ "description": "Lower bound of the confidence interval"
274
+ },
275
+ "upper": {
276
+ "type": "number",
277
+ "description": "Upper bound of the confidence interval"
278
+ },
279
+ "confidence_level": {
280
+ "type": "number",
281
+ "description": "Confidence level (e.g. 0.95 for a 95% confidence interval)",
282
+ "minimum": 0,
283
+ "maximum": 1
284
+ },
285
+ "method": {
286
+ "type": "string",
287
+ "description": "How the confidence interval was computed"
288
+ }
289
+ },
290
+ "required": ["lower", "upper"]
291
+ },
292
+ "standard_deviation": {
293
+ "type": "number",
294
+ "description": "Standard deviation of the per-sample scores"
295
+ },
296
+ "num_samples": {
297
+ "type": "integer",
298
+ "description": "Number of samples used to compute the uncertainty estimates"
299
+ },
300
+ "num_bootstrap_samples": {
301
+ "type": "integer",
302
+ "description": "Number of bootstrap resamples used, if bootstrap method was applied"
303
+ }
304
+ }
305
+ }
306
+ }
307
+ },
308
+ "generation_config": {
309
+ "type": "object",
310
+ "properties": {
311
+ "generation_args": {
312
+ "type": "object",
313
+ "description": "Parameters used to generate results - properties may vary by model type",
314
+ "properties": {
315
+ "temperature": {
316
+ "type": [
317
+ "null",
318
+ "number"
319
+ ],
320
+ "description": "Sampling temperature"
321
+ },
322
+ "top_p": {
323
+ "type": [
324
+ "null",
325
+ "number"
326
+ ],
327
+ "description": "Nucleus sampling parameter"
328
+ },
329
+ "top_k": {
330
+ "type": [
331
+ "null",
332
+ "number"
333
+ ],
334
+ "description": "Top-k sampling parameter"
335
+ },
336
+ "max_tokens": {
337
+ "type": "integer",
338
+ "minimum": 1,
339
+ "description": "Maximum number of tokens to generate"
340
+ },
341
+ "execution_command": {
342
+ "type": "string",
343
+ "description": "Command used to run the model to generate results"
344
+ },
345
+ "reasoning": {
346
+ "type": "boolean",
347
+ "description": "Whether reasoning orchain-of-thought was used to generate results"
348
+ },
349
+ "prompt_template": {
350
+ "type": "string",
351
+ "description": "Input prompt template for task (should contain agentic info if needed)."
352
+ },
353
+ "agentic_eval_config": {
354
+ "type": "object",
355
+ "description": "General configuration for agentic evaluations.",
356
+ "properties": {
357
+ "available_tools": {
358
+ "type": "array",
359
+ "description": "List of all available tools with their configurations",
360
+ "items": {
361
+ "type": "object",
362
+ "properties": {
363
+ "name": {
364
+ "type": "string",
365
+ "description": "e.g. bash, calculator, ..."
366
+ },
367
+ "description": {
368
+ "type": "string"
369
+ },
370
+ "parameters": {
371
+ "$ref": "#/$defs/additional_properties_object"
372
+ }
373
+ }
374
+ }
375
+ },
376
+ "additional_details": {
377
+ "$ref": "#/$defs/additional_properties_object"
378
+ }
379
+ }
380
+ },
381
+ "eval_plan": {
382
+ "type": "object",
383
+ "description": "Plan (solvers) used in evaluation. Solvers are crucial parts of Inspect evaluations which can serve a wide variety of purposes like providing system prompts, prompt engineering, model generation or multi-turn dialog.",
384
+ "properties": {
385
+ "name": {
386
+ "type": "string"
387
+ },
388
+ "steps": {
389
+ "type": "array",
390
+ "description": "Array of evaluation plan steps",
391
+ "items": {
392
+ "solver": {
393
+ "type": "string",
394
+ "description": "Name of solver e.g. system_message, react."
395
+ },
396
+ "parameters": {
397
+ "$ref": "#/$defs/additional_properties_object"
398
+ }
399
+ }
400
+ },
401
+ "config": {
402
+ "$ref": "#/$defs/additional_properties_object"
403
+ }
404
+ }
405
+ },
406
+ "eval_limits": {
407
+ "type": "object",
408
+ "description": "Listed evaluation limits like time limit, message limit, token limit.",
409
+ "properties": {
410
+ "time_limit": {
411
+ "type": "integer",
412
+ "description": "Time limit for evaluation."
413
+ },
414
+ "message_limit": {
415
+ "type": "integer",
416
+ "description": "Message limit for evaluation."
417
+ },
418
+ "token_limit": {
419
+ "type": "integer",
420
+ "description": "Token limit for evaluation."
421
+ }
422
+ }
423
+ },
424
+ "sandbox": {
425
+ "type": "object",
426
+ "properties": {
427
+ "type": {
428
+ "type": "string",
429
+ "description": "Type of sandbox e.g. docker"
430
+ },
431
+ "config": {
432
+ "type": "string",
433
+ "description": "Config file name/path e.g. compose.yaml. TODO or full config? Not sure based on the Inspect docs"
434
+ }
435
+ }
436
+ },
437
+ "max_attempts": {
438
+ "type": "integer",
439
+ "description": "Maximum number of submission attempts (default 1).",
440
+ "default": 1
441
+ },
442
+ "incorrect_attempt_feedback": {
443
+ "type": "string",
444
+ "description": "Feedback from the model after incorrect attempt."
445
+ }
446
+ },
447
+ "additionalProperties": true
448
+ },
449
+ "additional_details": {
450
+ "$ref": "#/$defs/additional_properties_object"
451
+ }
452
+ }
453
+ }
454
+ }
455
+ }
456
+ },
457
+ "detailed_evaluation_results": {
458
+ "description": "Reference to the evaluation results for all individual samples in the evaluation",
459
+ "properties": {
460
+ "format": {
461
+ "type": "string",
462
+ "description": "Format of the detailed evaluation results",
463
+ "enum": [
464
+ "jsonl",
465
+ "json"
466
+ ]
467
+ },
468
+ "file_path": {
469
+ "type": "string",
470
+ "description": "Path to the detailed evaluation results file"
471
+ },
472
+ "hash_algorithm": {
473
+ "type": "string",
474
+ "description": "Hash algorithm used for checksum and sample_hash in instance-level data",
475
+ "enum": [
476
+ "sha256",
477
+ "md5"
478
+ ]
479
+ },
480
+ "checksum": {
481
+ "type": "string",
482
+ "description": "Checksum value of the file"
483
+ },
484
+ "total_rows": {
485
+ "type": "integer",
486
+ "description": "Total number of rows in the detailed evaluation results file"
487
+ }
488
+ }
489
+ }
490
+ },
491
+ "$defs": {
492
+ "additional_properties_object": {
493
+ "type": "object",
494
+ "description": "Additional parameters (key-value object)",
495
+ "additionalProperties": true
496
+ },
497
+ "judge_config": {
498
+ "type": "object",
499
+ "description": "Configuration for a single LLM judge/juror",
500
+ "required": [
501
+ "model_info"
502
+ ],
503
+ "properties": {
504
+ "model_info": {
505
+ "$ref": "#/$defs/model_info"
506
+ },
507
+ "temperature": {
508
+ "type": "number"
509
+ },
510
+ "weight": {
511
+ "type": "number",
512
+ "description": "Weight of this judge's score in aggregation (used in jury)"
513
+ }
514
+ }
515
+ },
516
+ "model_info": {
517
+ "type": "object",
518
+ "description": "Complete model specification including basic information, technical configuration and inference settings",
519
+ "required": [
520
+ "name",
521
+ "id"
522
+ ],
523
+ "properties": {
524
+ "name": {
525
+ "type": "string",
526
+ "description": "Model name provided by evaluation source"
527
+ },
528
+ "id": {
529
+ "type": "string",
530
+ "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)"
531
+ },
532
+ "developer": {
533
+ "type": "string",
534
+ "description": "Name of organization that provides the model (e.g. 'OpenAI')"
535
+ },
536
+ "inference_platform": {
537
+ "type": "string",
538
+ "description": "Name of inference platform which provides an access to models by API to run the evaluations or provides models weights to run them locally (e.g. HuggingFace, Bedrock, Together AI)"
539
+ },
540
+ "inference_engine": {
541
+ "type": "object",
542
+ "description": "Name of inference engine which provides an access to optimized models to use them for local evaluations (e.g. vLLM, Ollama).",
543
+ "properties": {
544
+ "name": {
545
+ "type": "string",
546
+ "description": "Name of the inference engine"
547
+ },
548
+ "version": {
549
+ "type": "string",
550
+ "description": "Version of the inference engine"
551
+ }
552
+ }
553
+ },
554
+ "additional_details": {
555
+ "$ref": "#/$defs/additional_properties_object"
556
+ }
557
+ }
558
+ },
559
+ "source_data_url": {
560
+ "type": "object",
561
+ "description": "URL source for the evaluation data",
562
+ "required": [
563
+ "dataset_name",
564
+ "source_type",
565
+ "url"
566
+ ],
567
+ "additionalProperties": true,
568
+ "properties": {
569
+ "dataset_name": {
570
+ "type": "string",
571
+ "description": "Name of the source dataset"
572
+ },
573
+ "source_type": {
574
+ "const": "url"
575
+ },
576
+ "url": {
577
+ "type": "array",
578
+ "items": {
579
+ "type": "string"
580
+ },
581
+ "minItems": 1,
582
+ "description": "URL(s) for the source of the evaluation data"
583
+ },
584
+ "additional_details": {
585
+ "$ref": "#/$defs/additional_properties_object"
586
+ }
587
+ }
588
+ },
589
+ "source_data_hf": {
590
+ "type": "object",
591
+ "description": "Details about HuggingFace dataset used for evaluation",
592
+ "required": [
593
+ "dataset_name",
594
+ "source_type"
595
+ ],
596
+ "additionalProperties": true,
597
+ "properties": {
598
+ "dataset_name": {
599
+ "type": "string",
600
+ "description": "Name of the source dataset"
601
+ },
602
+ "source_type": {
603
+ "const": "hf_dataset"
604
+ },
605
+ "hf_repo": {
606
+ "type": "string",
607
+ "description": "HuggingFace repository identifier"
608
+ },
609
+ "hf_split": {
610
+ "type": "string",
611
+ "description": "One of train, val or test."
612
+ },
613
+ "samples_number": {
614
+ "type": "integer",
615
+ "description": "Number of samples in the dataset"
616
+ },
617
+ "sample_ids": {
618
+ "type": "array",
619
+ "description": "Array of sample ids used for evaluation",
620
+ "items": {
621
+ "type": [
622
+ "integer",
623
+ "string"
624
+ ]
625
+ }
626
+ },
627
+ "additional_details": {
628
+ "$ref": "#/$defs/additional_properties_object"
629
+ }
630
+ }
631
+ },
632
+ "source_data_private": {
633
+ "type": "object",
634
+ "description": "Generic source data when neither URL array nor HuggingFace dataset applies",
635
+ "required": [
636
+ "dataset_name",
637
+ "source_type"
638
+ ],
639
+ "properties": {
640
+ "dataset_name": {
641
+ "type": "string",
642
+ "description": "Name of the source dataset"
643
+ },
644
+ "source_type": {
645
+ "const": "other"
646
+ },
647
+ "additional_details": {
648
+ "$ref": "#/$defs/additional_properties_object"
649
+ }
650
+ }
651
+ }
652
+ }
653
+ }
eval_types.py ADDED
@@ -0,0 +1,378 @@
1
+ # generated by datamodel-codegen:
2
+ # filename: eval.schema.json
3
+ # timestamp: 2026-02-11T15:00:05+00:00
4
+
5
+ from __future__ import annotations
6
+ from enum import Enum
7
+ from pydantic import BaseModel, ConfigDict, Field, confloat, conint
8
+ from typing import Any, Literal
9
+
10
+
11
+ class SourceType(Enum):
12
+ documentation = "documentation"
13
+ evaluation_run = "evaluation_run"
14
+
15
+
16
+ class EvaluatorRelationship(Enum):
17
+ first_party = "first_party"
18
+ third_party = "third_party"
19
+ collaborative = "collaborative"
20
+ other = "other"
21
+
22
+
23
+ class SourceMetadata(BaseModel):
24
+ source_name: str | None = Field(
25
+ None,
26
+ description="Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation).",
27
+ )
28
+ source_type: SourceType = Field(
29
+ ...,
30
+ description="Whether the data comes from a direct evaluation run or from documentation",
31
+ )
32
+ source_organization_name: str = Field(
33
+ ..., description="Name of the organization that provides the data"
34
+ )
35
+ source_organization_url: str | None = Field(
36
+ None, description="URL for the organization that provides the data"
37
+ )
38
+ source_organization_logo_url: str | None = Field(
39
+ None, description="URL for the Logo for the organization that provides the data"
40
+ )
41
+ evaluator_relationship: EvaluatorRelationship = Field(
42
+ ..., description="Relationship between the evaluator and the model"
43
+ )
44
+
45
+
46
+ class ScoreType(Enum):
47
+ binary = "binary"
48
+ continuous = "continuous"
49
+ levels = "levels"
50
+
51
+
52
+ class AggregationMethod(Enum):
53
+ majority_vote = "majority_vote"
54
+ average = "average"
55
+ weighted_average = "weighted_average"
56
+ median = "median"
57
+
58
+
59
+ class StandardError(BaseModel):
60
+ value: float = Field(..., description="The standard error value")
61
+ method: str | None = Field(
62
+ None,
63
+ description="How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')",
64
+ )
65
+
66
+
67
+ class ConfidenceInterval(BaseModel):
68
+ lower: float = Field(..., description="Lower bound of the confidence interval")
69
+ upper: float = Field(..., description="Upper bound of the confidence interval")
70
+ confidence_level: confloat(ge=0.0, le=1.0) | None = Field(
71
+ None, description="Confidence level (e.g. 0.95 for a 95% confidence interval)"
72
+ )
73
+ method: str | None = Field(
74
+ None, description="How the confidence interval was computed"
75
+ )
76
+
77
+
78
+ class Uncertainty(BaseModel):
79
+ standard_error: StandardError | None = Field(
80
+ None,
81
+ description="Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))",
82
+ )
83
+ confidence_interval: ConfidenceInterval | None = Field(
84
+ None,
85
+ description="Lower and upper bounds for the metric at a given confidence level.",
86
+ )
87
+ standard_deviation: float | None = Field(
88
+ None, description="Standard deviation of the per-sample scores"
89
+ )
90
+ num_samples: int | None = Field(
91
+ None, description="Number of samples used to compute the uncertainty estimates"
92
+ )
93
+ num_bootstrap_samples: int | None = Field(
94
+ None,
95
+ description="Number of bootstrap resamples used, if bootstrap method was applied",
96
+ )
97
+
98
+
99
+ class EvalLimits(BaseModel):
100
+ time_limit: int | None = Field(None, description="Time limit for evaluation.")
101
+ message_limit: int | None = Field(None, description="Message limit for evaluation.")
102
+ token_limit: int | None = Field(None, description="Token limit for evaluation.")
103
+
104
+
105
+ class Sandbox(BaseModel):
106
+ type: str | None = Field(None, description="Type of sandbox e.g. docker")
107
+ config: str | None = Field(
108
+ None,
109
+ description="Config file name/path e.g. compose.yaml. TODO or full config? Not sure based on the Inspect docs",
110
+ )
111
+
112
+
113
+ class Format(Enum):
114
+ jsonl = "jsonl"
115
+ json = "json"
116
+
117
+
118
+ class HashAlgorithm(Enum):
119
+ sha256 = "sha256"
120
+ md5 = "md5"
121
+
122
+
123
+ class DetailedEvaluationResults(BaseModel):
124
+ format: Format | None = Field(
125
+ None, description="Format of the detailed evaluation results"
126
+ )
127
+ file_path: str | None = Field(
128
+ None, description="Path to the detailed evaluation results file"
129
+ )
130
+ hash_algorithm: HashAlgorithm | None = Field(
131
+ None,
132
+ description="Hash algorithm used for checksum and sample_hash in instance-level data",
133
+ )
134
+ checksum: str | None = Field(None, description="Checksum value of the file")
135
+ total_rows: int | None = Field(
136
+ None, description="Total number of rows in the detailed evaluation results file"
137
+ )
138
+
139
+
140
+ class AdditionalPropertiesObject(BaseModel):
141
+ model_config = ConfigDict(
142
+ extra="allow",
143
+ )
144
+
145
+
146
+ class InferenceEngine(BaseModel):
147
+ name: str | None = Field(None, description="Name of the inference engine")
148
+ version: str | None = Field(None, description="Version of the inference engine")
149
+
150
+
151
+ class ModelInfo(BaseModel):
152
+ name: str = Field(..., description="Model name provided by evaluation source")
153
+ id: str = Field(
154
+ ...,
155
+ description="Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)",
156
+ )
157
+ developer: str | None = Field(
158
+ None, description="Name of organization that provides the model (e.g. 'OpenAI')"
159
+ )
160
+ inference_platform: str | None = Field(
161
+ None,
162
+ description="Name of inference platform which provides an access to models by API to run the evaluations or provides models weights to run them locally (e.g. HuggingFace, Bedrock, Together AI)",
163
+ )
164
+ inference_engine: InferenceEngine | None = Field(
165
+ None,
166
+ description="Name of inference engine which provides an access to optimized models to use them for local evaluations (e.g. vLLM, Ollama).",
167
+ )
168
+ additional_details: AdditionalPropertiesObject | None = None
169
+
170
+
171
+ class SourceDataUrl(BaseModel):
172
+ model_config = ConfigDict(
173
+ extra="allow",
174
+ )
175
+ dataset_name: str = Field(..., description="Name of the source dataset")
176
+ source_type: Literal["url"]
177
+ url: list[str] = Field(
178
+ ..., description="URL(s) for the source of the evaluation data", min_length=1
179
+ )
180
+ additional_details: AdditionalPropertiesObject | None = None
181
+
182
+
183
+ class SourceDataHf(BaseModel):
184
+ model_config = ConfigDict(
185
+ extra="allow",
186
+ )
187
+ dataset_name: str = Field(..., description="Name of the source dataset")
188
+ source_type: Literal["hf_dataset"]
189
+ hf_repo: str | None = Field(None, description="HuggingFace repository identifier")
190
+ hf_split: str | None = Field(None, description="One of train, val or test.")
191
+ samples_number: int | None = Field(
192
+ None, description="Number of samples in the dataset"
193
+ )
194
+ sample_ids: list[int | str] | None = Field(
195
+ None, description="Array of sample ids used for evaluation"
196
+ )
197
+ additional_details: AdditionalPropertiesObject | None = None
198
+
199
+
200
+ class SourceDataPrivate(BaseModel):
201
+ dataset_name: str = Field(..., description="Name of the source dataset")
202
+ source_type: Literal["other"]
203
+ additional_details: AdditionalPropertiesObject | None = None
204
+
205
+
206
+ class ScoreDetails(BaseModel):
207
+ score: float = Field(..., description="The score for the evaluation")
208
+ details: AdditionalPropertiesObject | None = None
209
+ uncertainty: Uncertainty | None = Field(
210
+ None, description="Quantification of uncertainty around the reported score"
211
+ )
212
+
213
+
214
+ class AvailableTool(BaseModel):
215
+ name: str | None = Field(None, description="e.g. bash, calculator, ...")
216
+ description: str | None = None
217
+ parameters: AdditionalPropertiesObject | None = None
218
+
219
+
220
+ class AgenticEvalConfig(BaseModel):
221
+ available_tools: list[AvailableTool] | None = Field(
222
+ None, description="List of all available tools with their configurations"
223
+ )
224
+ additional_details: AdditionalPropertiesObject | None = None
225
+
226
+
227
+ class EvalPlan(BaseModel):
228
+ name: str | None = None
229
+ steps: list[Any] | None = Field(None, description="Array of evaluation plan steps")
230
+ config: AdditionalPropertiesObject | None = None
231
+
232
+
233
+ class GenerationArgs(BaseModel):
234
+ model_config = ConfigDict(
235
+ extra="allow",
236
+ )
237
+ temperature: float | None = Field(None, description="Sampling temperature")
238
+ top_p: float | None = Field(None, description="Nucleus sampling parameter")
239
+ top_k: float | None = Field(None, description="Top-k sampling parameter")
240
+ max_tokens: conint(ge=1) | None = Field(
241
+ None, description="Maximum number of tokens to generate"
242
+ )
243
+ execution_command: str | None = Field(
244
+ None, description="Command used to run the model to generate results"
245
+ )
246
+ reasoning: bool | None = Field(
247
+ None,
248
+ description="Whether reasoning orchain-of-thought was used to generate results",
249
+ )
250
+ prompt_template: str | None = Field(
251
+ None,
252
+ description="Input prompt template for task (should contain agentic info if needed).",
253
+ )
254
+ agentic_eval_config: AgenticEvalConfig | None = Field(
255
+ None, description="General configuration for agentic evaluations."
256
+ )
257
+ eval_plan: EvalPlan | None = Field(
258
+ None,
259
+ description="Plan (solvers) used in evaluation. Solvers are crucial parts of Inspect evaluations which can serve a wide variety of purposes like providing system prompts, prompt engineering, model generation or multi-turn dialog.",
260
+ )
261
+ eval_limits: EvalLimits | None = Field(
262
+ None,
263
+ description="Listed evaluation limits like time limit, message limit, token limit.",
264
+ )
265
+ sandbox: Sandbox | None = None
266
+ max_attempts: int | None = Field(
267
+ 1, description="Maximum number of submission attempts (default 1)."
268
+ )
269
+ incorrect_attempt_feedback: str | None = Field(
270
+ None, description="Feedback from the model after incorrect attempt."
271
+ )
272
+
273
+
274
+ class GenerationConfig(BaseModel):
275
+ generation_args: GenerationArgs | None = Field(
276
+ None,
277
+ description="Parameters used to generate results - properties may vary by model type",
278
+ )
279
+ additional_details: AdditionalPropertiesObject | None = None
280
+
281
+
282
+ class JudgeConfig(BaseModel):
283
+ model_info: ModelInfo
284
+ temperature: float | None = None
285
+ weight: float | None = Field(
286
+ None, description="Weight of this judge's score in aggregation (used in jury)"
287
+ )
288
+
289
+
290
+ class LlmScoring(BaseModel):
291
+ model_config = ConfigDict(
292
+ extra="allow",
293
+ )
294
+ judges: list[JudgeConfig] = Field(
295
+ ...,
296
+ description="LLM judge(s) - single item for judge, multiple for jury",
297
+ min_length=1,
298
+ )
299
+ input_prompt: str = Field(..., description="Prompt template used for judging")
300
+ aggregation_method: AggregationMethod | None = Field(
301
+ None, description="How to aggregate scores when multiple judges"
302
+ )
303
+ expert_baseline: float | None = Field(
304
+ None, description="Expert/human baseline score for comparison"
305
+ )
306
+ additional_details: AdditionalPropertiesObject | None = None
307
+
308
+
309
+ class MetricConfig(BaseModel):
310
+ evaluation_description: str | None = Field(
311
+ None, description="Description of the evaluation"
312
+ )
313
+ lower_is_better: bool = Field(..., description="Whether a lower score is better")
314
+ score_type: ScoreType | None = Field(None, description="Type of score")
315
+ level_names: list[str] | None = Field(None, description="Names of the score levels")
316
+ level_metadata: list[str] | None = Field(
317
+ None, description="Additional Description for each Score Level"
318
+ )
319
+ has_unknown_level: bool | None = Field(
320
+ None,
321
+ description="Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown",
322
+ )
323
+ min_score: float | None = Field(
324
+ None, description="Minimum possible score for continuous metric"
325
+ )
326
+ max_score: float | None = Field(
327
+ None, description="Maximum possible score for continuous metric"
328
+ )
329
+ llm_scoring: LlmScoring | None = Field(
330
+ None, description="Configuration when LLM is used as scorer/judge"
331
+ )
332
+
333
+
334
+ class EvaluationResult(BaseModel):
335
+ evaluation_name: str = Field(..., description="Name of the evaluation")
336
+ source_data: SourceDataUrl | SourceDataHf | SourceDataPrivate = Field(
337
+ ...,
338
+ description="Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.",
339
+ )
340
+ evaluation_timestamp: str | None = Field(
341
+ None, description="Timestamp for when the evaluations were run"
342
+ )
343
+ metric_config: MetricConfig = Field(..., description="Details about the metric")
344
+ score_details: ScoreDetails = Field(
345
+ ..., description="The score for the evaluation and related details"
346
+ )
347
+ generation_config: GenerationConfig | None = None
348
+
349
+
350
+ class EvaluationLog(BaseModel):
351
+ model_config = ConfigDict(
352
+ extra="forbid",
353
+ )
354
+ schema_version: str = Field(
355
+ ..., description="Version of the schema used for this evaluation data"
356
+ )
357
+ evaluation_id: str = Field(
358
+ ...,
359
+ description="Unique identifier for this specific evaluation run. Use eval_name/model_id/retrieved_timestamp format",
360
+ )
361
+ evaluation_timestamp: str | None = Field(
362
+ None, description="Timestamp for when the evaluation was run"
363
+ )
364
+ retrieved_timestamp: str = Field(
365
+ ...,
366
+ description="Timestamp for when this record was created - using Unix Epoch time format",
367
+ )
368
+ source_metadata: SourceMetadata = Field(
369
+ ..., description="Metadata about the source of the leaderboard data"
370
+ )
371
+ model_info: ModelInfo
372
+ evaluation_results: list[EvaluationResult] = Field(
373
+ ..., description="Array of evaluation results"
374
+ )
375
+ detailed_evaluation_results: DetailedEvaluationResults | None = Field(
376
+ None,
377
+ description="Reference to the evaluation results for all individual samples in the evaluation",
378
+ )
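
As a usage sketch (not part of the commit), a record containing only the required fields can be validated with the generated models as shown below. The payload values are hypothetical and mirror the required properties of eval.schema.json.

```python
# Hypothetical minimal payload validated with the generated Pydantic models.
from eval_types import EvaluationLog

payload = {
    "schema_version": "0.2.0",
    "evaluation_id": "GSM8K/meta-llama/Llama-3.1-8B-Instruct/1700000000",
    "retrieved_timestamp": "1700000000",
    "source_metadata": {
        "source_type": "evaluation_run",
        "source_organization_name": "Example Org",
        "evaluator_relationship": "third_party",
    },
    "model_info": {
        "name": "Llama 3.1 8B Instruct",
        "id": "meta-llama/Llama-3.1-8B-Instruct",
    },
    "evaluation_results": [
        {
            "evaluation_name": "GSM8K",
            "source_data": {"dataset_name": "GSM8K", "source_type": "other"},
            "metric_config": {"lower_is_better": False, "score_type": "binary"},
            "score_details": {"score": 0.81},
        }
    ],
}

log = EvaluationLog.model_validate(payload)
print(log.evaluation_results[0].score_details.score)  # 0.81
```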
instance_level_eval.schema.json ADDED
@@ -0,0 +1,329 @@
1
+ { "$schema": "http://json-schema.org/draft-07/schema#",
2
+ "version": "instance_level_eval_0.2.0",
3
+ "type": "object",
4
+ "description": "Schema for storing instance-level evaluation data for LLM evaluations, supporting single-turn, multi-turn, and agentic interactions",
5
+ "required": [
6
+ "schema_version",
7
+ "evaluation_id",
8
+ "model_id",
9
+ "evaluation_name",
10
+ "sample_id",
11
+ "interaction_type",
12
+ "input",
13
+ "answer_attribution",
14
+ "evaluation"
15
+ ],
16
+ "additionalProperties": true,
17
+ "properties": {
18
+ "schema_version": {
19
+ "type": "string",
20
+ "description": "Version of the schema used for this instance data"
21
+ },
22
+ "evaluation_id": {
23
+ "type": "string",
24
+ "description": "Foreign key linking to the aggregate evaluation JSON. Must match the evaluation_id in the aggregate file."
25
+ },
26
+ "model_id": {
27
+ "type": "string",
28
+ "description": "Identifier of the model in HuggingFace format (e.g. meta-llama/Llama-3.2-1B-Instruct)"
29
+ },
30
+ "evaluation_name": {
31
+ "type": "string",
32
+ "description": "The specific eval name, ideally unique (e.g. GSM8K, mmlu_physics)"
33
+ },
34
+ "sample_id": {
35
+ "type": ["integer", "string"],
36
+ "description": "Question/sample identifier from the original dataset (e.g. gsm8k_0001)"
37
+ },
38
+ "sample_hash": {
39
+ "type": "string",
40
+ "description": "Hash of (input.raw + input.reference) to ensure comparison is between the same sample across models, in case sample_id is not consistent"
41
+ },
42
+ "interaction_type": {
43
+ "type": "string",
44
+ "enum": ["single_turn", "multi_turn", "agentic"],
45
+ "description": "Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents"
46
+ },
47
+ "input": {
48
+ "type": "object",
49
+ "description": "Input data for the evaluation sample",
50
+ "required": ["raw", "reference"],
51
+ "properties": {
52
+ "raw": {
53
+ "type": "string",
54
+ "description": "The raw input as defined in the eval"
55
+ },
56
+ "formatted": {
57
+ "type": "string",
58
+ "description": "Includes chat template, CoT and all relevant modifications - basically the un-tokenized version of what the model sees"
59
+ },
60
+ "reference": {
61
+ "type": "string",
62
+ "description": "Ground truth or reference answer for comparison/scoring"
63
+ },
64
+ "choices": {
65
+ "type": "array",
66
+ "description": "Optional list of choices for multiple-choice questions",
67
+ "items": {
68
+ "type": "string"
69
+ }
70
+ }
71
+ }
72
+ },
73
+ "output": {
74
+ "type": ["object", "null"],
75
+ "description": "Output data - only used for single_turn interactions, null for multi_turn/agentic",
76
+ "required": ["raw"],
77
+ "properties": {
78
+ "raw": {
79
+ "type": "string",
80
+ "description": "Complete model response"
81
+ },
82
+ "reasoning_trace": {
83
+ "type": ["string", "null"],
84
+ "description": "Reasoning trace of the model if applicable (e.g. chain-of-thought tokens)"
85
+ }
86
+ }
87
+ },
88
+ "interactions": {
89
+ "type": ["array", "null"],
90
+ "description": "List of interactions - used for multi_turn and agentic, null for single_turn",
91
+ "items": {
92
+ "type": "object",
93
+ "required": ["turn_idx", "role"],
94
+ "properties": {
95
+ "turn_idx": {
96
+ "type": "integer",
97
+ "minimum": 0,
98
+ "description": "Index starting from 0 indicating the position in the conversation"
99
+ },
100
+ "role": {
101
+ "type": "string",
102
+ "description": "Role of the speaker (e.g. user, assistant, system, tool)"
103
+ },
104
+ "content": {
105
+ "type": ["string", "null"],
106
+ "description": "The actual raw text for that particular turn (can be null if empty)"
107
+ },
108
+ "reasoning_trace": {
109
+ "type": ["string", "null"],
110
+ "description": "Reasoning trace for that particular turn if applicable"
111
+ },
112
+ "tool_calls": {
113
+ "type": ["array", "null"],
114
+ "description": "List of tool invocations for this turn, if applicable",
115
+ "items": {
116
+ "type": "object",
117
+ "required": ["id", "name"],
118
+ "properties": {
119
+ "id": {
120
+ "type": "string",
121
+ "description": "Unique identifier for the tool call"
122
+ },
123
+ "name": {
124
+ "type": "string",
125
+ "description": "Name of tool/function"
126
+ },
127
+ "arguments": {
128
+ "type": "object",
129
+ "description": "Arguments used to call the tool",
130
+ "additionalProperties": true
131
+ }
132
+ }
133
+ }
134
+ },
135
+ "tool_call_id": {
136
+ "oneOf": [
137
+ {
138
+ "type": "string",
139
+ "description": "Reference to the tool call ID this turn is responding to (for tool role responses)"
140
+ },
141
+ {
142
+ "type": "array",
143
+ "description": "Reference to the tool call ID(s) this message has the content payload for.",
144
+ "items": {
145
+ "type": "string"
146
+ }
147
+ }
148
+ ]
149
+ }
150
+ }
151
+ }
152
+ },
153
+ "answer_attribution": {
154
+ "type": "array",
155
+ "description": "Information about how the answer was extracted from the model output",
156
+ "items": {
157
+ "type": "object",
158
+ "required": ["turn_idx", "source", "extracted_value", "extraction_method", "is_terminal"],
159
+ "properties": {
160
+ "turn_idx": {
161
+ "type": "integer",
162
+ "minimum": 0,
163
+ "description": "Turn index in interactions. 0 for single_turn"
164
+ },
165
+ "source": {
166
+ "type": "string",
167
+ "description": "Source of the extracted value (e.g. 'output.raw' or 'interactions[turn_idx].content')"
168
+ },
169
+ "extracted_value": {
170
+ "type": "string",
171
+ "description": "Value that was extracted"
172
+ },
173
+ "extraction_method": {
174
+ "type": "string",
175
+ "description": "Method used to extract the value (e.g. regex, exact_match, llm_judge, custom)"
176
+ },
177
+ "is_terminal": {
178
+ "type": "boolean",
179
+ "description": "Whether this is the final answer (false if intermediate outputs are used to build up to a final answer)"
180
+ }
181
+ }
182
+ }
183
+ },
184
+ "evaluation": {
185
+ "type": "object",
186
+ "description": "Evaluation results and scoring data",
187
+ "required": ["score", "is_correct"],
188
+ "properties": {
189
+ "score": {
190
+ "type": ["number", "boolean"],
191
+ "description": "Instance-level score"
192
+ },
193
+ "is_correct": {
194
+ "type": "boolean",
195
+ "description": "Whether the final answer is correct"
196
+ },
197
+ "num_turns": {
198
+ "type": "integer",
199
+ "minimum": 1,
200
+ "description": "Number of turns in the interaction"
201
+ },
202
+ "tool_calls_count": {
203
+ "type": "integer",
204
+ "minimum": 0,
205
+ "description": "Count of tool calls across all turns in interactions"
206
+ }
207
+ }
208
+ },
209
+ "token_usage": {
210
+ "type": ["object", "null"],
211
+ "description": "Token usage for the model completion",
212
+ "required": ["input_tokens", "output_tokens", "total_tokens"],
213
+ "properties": {
214
+ "input_tokens": {
215
+ "type": "integer",
216
+ "minimum": 0,
217
+ "description": "Total input tokens used"
218
+ },
219
+ "output_tokens": {
220
+ "type": "integer",
221
+ "minimum": 0,
222
+ "description": "Total output tokens used"
223
+ },
224
+ "total_tokens": {
225
+ "type": "integer",
226
+ "minimum": 0,
227
+ "description": "Total tokens used"
228
+ },
229
+ "input_tokens_cache_write": {
230
+ "type": ["integer", "null"],
231
+ "minimum": 0,
232
+ "description": "Number of tokens written to the cache"
233
+ },
234
+ "input_tokens_cache_read": {
235
+ "type": ["integer", "null"],
236
+ "minimum": 0,
237
+ "description": "Number of tokens retrieved from the cache"
238
+ },
239
+ "reasoning_tokens": {
240
+ "type": ["integer", "null"],
241
+ "minimum": 0,
242
+ "description": "Number of tokens used for reasoning"
243
+ }
244
+ }
245
+ },
246
+ "performance": {
247
+ "type": ["object", "null"],
248
+ "description": "Performance and latency metrics",
249
+ "properties": {
250
+ "latency_ms": {
251
+ "type": ["number", "null"],
252
+ "minimum": 0,
253
+ "description": "Total latency in milliseconds"
254
+ },
255
+ "time_to_first_token_ms": {
256
+ "type": ["number", "null"],
257
+ "minimum": 0,
258
+ "description": "Time to first token in milliseconds"
259
+ },
260
+ "generation_time_ms": {
261
+ "type": ["number", "null"],
262
+ "minimum": 0,
263
+ "description": "Time for generation in milliseconds"
264
+ }
265
+ },
266
+ "additionalProperties": true
267
+ },
268
+ "error": {
269
+ "type": ["string", "null"],
270
+ "description": "Information about any error that occurred (e.g. timeout, refusal, API error)"
271
+ },
272
+ "metadata": {
273
+ "type": "object",
274
+ "description": "Optional metadata about the sample (e.g. subject, difficulty, tags)",
275
+ "additionalProperties": true
276
+ }
277
+ },
278
+ "allOf": [
279
+ {
280
+ "if": {
281
+ "properties": {
282
+ "interaction_type": {
283
+ "const": "single_turn"
284
+ }
285
+ }
286
+ },
287
+ "then": {
288
+ "required": ["output"],
289
+ "properties": {
290
+ "output": {
291
+ "type": "object",
292
+ "not": {
293
+ "type": "null"
294
+ }
295
+ },
296
+ "interactions": {
297
+ "type": "null"
298
+ }
299
+ }
300
+ }
301
+ },
302
+ {
303
+ "if": {
304
+ "properties": {
305
+ "interaction_type": {
306
+ "enum": ["multi_turn", "agentic"]
307
+ }
308
+ }
309
+ },
310
+ "then": {
311
+ "required": ["interactions"],
312
+ "properties": {
313
+ "output": {
314
+ "type": "null"
315
+ },
316
+ "interactions": {
317
+ "type": "array",
318
+ "not": {
319
+ "type": "null"
320
+ }
321
+ },
322
+ "evaluation": {
323
+ "required": ["num_turns"]
324
+ }
325
+ }
326
+ }
327
+ }
328
+ ]
329
+ }
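
For illustration, a minimal single_turn record that should pass this schema can be checked with the jsonschema package; every field value below is made up.

# Illustrative only: validate one made-up single_turn record against
# instance_level_eval.schema.json using the jsonschema package.
import json

from jsonschema import validate

with open("instance_level_eval.schema.json") as f:
    schema = json.load(f)

record = {
    "schema_version": "instance_level_eval_0.2.0",
    "evaluation_id": "GSM8K/meta-llama/Llama-3.2-1B-Instruct/1739280000",
    "model_id": "meta-llama/Llama-3.2-1B-Instruct",
    "evaluation_name": "GSM8K",
    "sample_id": "gsm8k_0001",
    "interaction_type": "single_turn",
    "input": {"raw": "What is 2 + 2?", "reference": "4"},
    "output": {"raw": "The answer is 4."},
    "answer_attribution": [
        {
            "turn_idx": 0,
            "source": "output.raw",
            "extracted_value": "4",
            "extraction_method": "regex",
            "is_terminal": True,
        }
    ],
    "evaluation": {"score": 1.0, "is_correct": True},
}

validate(instance=record, schema=schema)  # raises jsonschema.ValidationError on failure
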
instance_level_types.py ADDED
@@ -0,0 +1,188 @@
1
+ # generated by datamodel-codegen:
2
+ # filename: instance_level_eval.schema.json
3
+ # timestamp: 2026-02-11T15:00:06+00:00
4
+
5
+ from __future__ import annotations
6
+ from enum import Enum
7
+ from pydantic import BaseModel, ConfigDict, Field, confloat, conint
8
+ from typing import Any
9
+
10
+
11
+ class InteractionType(Enum):
12
+ single_turn = "single_turn"
13
+ multi_turn = "multi_turn"
14
+ agentic = "agentic"
15
+
16
+
17
+ class Input(BaseModel):
18
+ raw: str = Field(..., description="The raw input as defined in the eval")
19
+ formatted: str | None = Field(
20
+ None,
21
+ description="Includes chat template, CoT and all relevant modifications - basically the un-tokenized version of what the model sees",
22
+ )
23
+ reference: str = Field(
24
+ ..., description="Ground truth or reference answer for comparison/scoring"
25
+ )
26
+ choices: list[str] | None = Field(
27
+ None, description="Optional list of choices for multiple-choice questions"
28
+ )
29
+
30
+
31
+ class Output(BaseModel):
32
+ raw: str = Field(..., description="Complete model response")
33
+ reasoning_trace: str | None = Field(
34
+ None,
35
+ description="Reasoning trace of the model if applicable (e.g. chain-of-thought tokens)",
36
+ )
37
+
38
+
39
+ class ToolCall(BaseModel):
40
+ id: str = Field(..., description="Unique identifier for the tool call")
41
+ name: str = Field(..., description="Name of tool/function")
42
+ arguments: dict[str, Any] | None = Field(
43
+ None, description="Arguments used to call the tool"
44
+ )
45
+
46
+
47
+ class Interaction(BaseModel):
48
+ turn_idx: conint(ge=0) = Field(
49
+ ...,
50
+ description="Index starting from 0 indicating the position in the conversation",
51
+ )
52
+ role: str = Field(
53
+ ..., description="Role of the speaker (e.g. user, assistant, system, tool)"
54
+ )
55
+ content: str | None = Field(
56
+ None,
57
+ description="The actual raw text for that particular turn (can be null if empty)",
58
+ )
59
+ reasoning_trace: str | None = Field(
60
+ None, description="Reasoning trace for that particular turn if applicable"
61
+ )
62
+ tool_calls: list[ToolCall] | None = Field(
63
+ None, description="List of tool invocations for this turn, if applicable"
64
+ )
65
+ tool_call_id: str | list[str] | None = None
66
+
67
+
68
+ class AnswerAttributionItem(BaseModel):
69
+ turn_idx: conint(ge=0) = Field(
70
+ ..., description="Turn index in interactions. 0 for single_turn"
71
+ )
72
+ source: str = Field(
73
+ ...,
74
+ description="Source of the extracted value (e.g. 'output.raw' or 'interactions[turn_idx].content')",
75
+ )
76
+ extracted_value: str = Field(..., description="Value that was extracted")
77
+ extraction_method: str = Field(
78
+ ...,
79
+ description="Method used to extract the value (e.g. regex, exact_match, llm_judge, custom)",
80
+ )
81
+ is_terminal: bool = Field(
82
+ ...,
83
+ description="Whether this is the final answer (false if intermediate outputs are used to build up to a final answer)",
84
+ )
85
+
86
+
87
+ class Evaluation(BaseModel):
88
+ score: float | bool = Field(..., description="Instance-level score")
89
+ is_correct: bool = Field(..., description="Whether the final answer is correct")
90
+ num_turns: conint(ge=1) | None = Field(
91
+ None, description="Number of turns in the interaction"
92
+ )
93
+ tool_calls_count: conint(ge=0) | None = Field(
94
+ None, description="Count of tool calls across all turns in interactions"
95
+ )
96
+
97
+
98
+ class TokenUsage(BaseModel):
99
+ input_tokens: conint(ge=0) = Field(..., description="Total input tokens used")
100
+ output_tokens: conint(ge=0) = Field(..., description="Total output tokens used")
101
+ total_tokens: conint(ge=0) = Field(..., description="Total tokens used")
102
+ input_tokens_cache_write: conint(ge=0) | None = Field(
103
+ None, description="Number of tokens written to the cache"
104
+ )
105
+ input_tokens_cache_read: conint(ge=0) | None = Field(
106
+ None, description="Number of tokens retrieved from the cache"
107
+ )
108
+ reasoning_tokens: conint(ge=0) | None = Field(
109
+ None, description="Number of tokens used for reasoning"
110
+ )
111
+
112
+
113
+ class Performance(BaseModel):
114
+ model_config = ConfigDict(
115
+ extra="allow",
116
+ )
117
+ latency_ms: confloat(ge=0.0) | None = Field(
118
+ None, description="Total latency in milliseconds"
119
+ )
120
+ time_to_first_token_ms: confloat(ge=0.0) | None = Field(
121
+ None, description="Time to first token in milliseconds"
122
+ )
123
+ generation_time_ms: confloat(ge=0.0) | None = Field(
124
+ None, description="Time for generation in milliseconds"
125
+ )
126
+
127
+
128
+ class InstanceLevelEvaluationLog(BaseModel):
129
+ model_config = ConfigDict(
130
+ extra="allow",
131
+ )
132
+ schema_version: str = Field(
133
+ ..., description="Version of the schema used for this instance data"
134
+ )
135
+ evaluation_id: str = Field(
136
+ ...,
137
+ description="Foreign key linking to the aggregate evaluation JSON. Must match the evaluation_id in the aggregate file.",
138
+ )
139
+ model_id: str = Field(
140
+ ...,
141
+ description="Identifier of the model in HuggingFace format (e.g. meta-llama/Llama-3.2-1B-Instruct)",
142
+ )
143
+ evaluation_name: str = Field(
144
+ ...,
145
+ description="The specific eval name, ideally unique (e.g. GSM8K, mmlu_physics)",
146
+ )
147
+ sample_id: int | str = Field(
148
+ ...,
149
+ description="Question/sample identifier from the original dataset (e.g. gsm8k_0001)",
150
+ )
151
+ sample_hash: str | None = Field(
152
+ None,
153
+ description="Hash of (input.raw + input.reference) to ensure comparison is between the same sample across models, in case sample_id is not consistent",
154
+ )
155
+ interaction_type: InteractionType = Field(
156
+ ...,
157
+ description="Type of interaction: single_turn for simple Q&A, multi_turn for conversations, agentic for tool-using agents",
158
+ )
159
+ input: Input = Field(..., description="Input data for the evaluation sample")
160
+ output: Output | None = Field(
161
+ None,
162
+ description="Output data - only used for single_turn interactions, null for multi_turn/agentic",
163
+ )
164
+ interactions: list[Interaction] | None = Field(
165
+ None,
166
+ description="List of interactions - used for multi_turn and agentic, null for single_turn",
167
+ )
168
+ answer_attribution: list[AnswerAttributionItem] = Field(
169
+ ...,
170
+ description="Information about how the answer was extracted from the model output",
171
+ )
172
+ evaluation: Evaluation = Field(
173
+ ..., description="Evaluation results and scoring data"
174
+ )
175
+ token_usage: TokenUsage | None = Field(
176
+ None, description="Token usage for the model completion"
177
+ )
178
+ performance: Performance | None = Field(
179
+ None, description="Performance and latency metrics"
180
+ )
181
+ error: str | None = Field(
182
+ None,
183
+ description="Information about any error that occurred (e.g. timeout, refusal, API error)",
184
+ )
185
+ metadata: dict[str, Any] | None = Field(
186
+ None,
187
+ description="Optional metadata about the sample (e.g. subject, difficulty, tags)",
188
+ )
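
As a rough usage sketch (made-up values), the generated models can also build such a record directly; for a multi_turn record the turns go into interactions and output stays None, mirroring the schema's conditional above.

# Rough sketch with made-up values: a multi_turn record built from the
# generated Pydantic models; serialize with model_dump_json() for one JSONL line.
from instance_level_types import (
    AnswerAttributionItem,
    Evaluation,
    Input,
    InstanceLevelEvaluationLog,
    Interaction,
    InteractionType,
)

log = InstanceLevelEvaluationLog(
    schema_version="instance_level_eval_0.2.0",
    evaluation_id="GSM8K/meta-llama/Llama-3.2-1B-Instruct/1739280000",
    model_id="meta-llama/Llama-3.2-1B-Instruct",
    evaluation_name="GSM8K",
    sample_id="gsm8k_0001",
    interaction_type=InteractionType.multi_turn,
    input=Input(raw="What is 2 + 2?", reference="4"),
    interactions=[
        Interaction(turn_idx=0, role="user", content="What is 2 + 2?"),
        Interaction(turn_idx=1, role="assistant", content="The answer is 4."),
    ],
    answer_attribution=[
        AnswerAttributionItem(
            turn_idx=1,
            source="interactions[1].content",
            extracted_value="4",
            extraction_method="regex",
            is_terminal=True,
        )
    ],
    evaluation=Evaluation(score=1.0, is_correct=True, num_turns=2),
)

print(log.model_dump_json())  # one line of a *_samples.jsonl / *_instances.jsonl file
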
pyproject.toml ADDED
@@ -0,0 +1,18 @@
1
+ [project]
2
+ name = "eee-validator"
3
+ version = "0.1.0"
4
+ description = "CI validation proxy for the EEE datastore on HuggingFace"
5
+ requires-python = ">=3.12"
6
+ dependencies = [
7
+ "huggingface-hub>=0.27.0",
8
+ "pydantic>=2.0",
9
+ "gradio>=5.0",
10
+ "jsonschema>=4.0",
11
+ ]
12
+
13
+ [tool.hatch.build.targets.wheel]
14
+ packages = ["."]
15
+
16
+ [build-system]
17
+ requires = ["hatchling"]
18
+ build-backend = "hatchling.build"
requirement_plan.txt ADDED
@@ -0,0 +1,44 @@
1
+
2
+ Our problem is as follows: we need to move our individual JSONs + JSONLs to HuggingFace, since it is simply a better structure and we might otherwise hit storage limits. New data would be submitted via drag-and-drop in the file-upload interface of HuggingFace datasets, which is for all practical purposes just git.
3
+
4
+ Aspirationally, the validation + de-duplication workflow is able to:
5
+ (1) Detect changes (i.e. data added only during the PR)
6
+ (2) Run de-duplication w.r.t. the PR and the existing datastore
7
+ (3) Run the validation only for the added data (one approach here would be to use git diff), and
8
+ (4) Add them back to the datastore
9
+
10
+ The huggingface dataset -> https://huggingface.co/datasets/evaleval/EEE_datastore (evaleval/EEE_datastore)
11
+
12
+ Both this repo and the dataset can be managed by git.
13
+
14
+ - repo structure
15
+ data/
16
+ ├── {eval_name}/
17
+ │ └── {developer_name}/
18
+ │ └── {model_name}/
19
+ │ └── {uuid}.json
20
+ └── {uuid}_samples.jsonl
21
+ validate_data.py
22
+ eval_types.py
23
+ instance_level_types.py
24
+
25
+ There are typically two file types: aggregate information as JSON, and instance-level samples as JSONL (files with "jsonl" in the name, e.g. _instances.jsonl); both need to be validated.
26
+
27
+ Data will be added by users through the upload functionality, which opens a PR.
28
+
29
+ First request: build a Dockerfile with uv in which the validation will be run, and collect all dependencies in a requirements.txt that can be installed with something like uv add -r.
30
+
31
+ Regarding the workflow, implement the following (a rough end-to-end sketch in Python follows at the end of this plan):
32
+
33
+ (1) Detect changes and pull to space
34
+ - When a user/external collaborator opens a PR on the HF dataset, we wake or trigger the Space via a webhook (or a better procedure if one exists). Following this, we use git diff (or something better, if you can recommend it) on the HuggingFace Space to find which files have been added (or modified).
35
+
36
+ Then download only the files that have changed, using the huggingface_hub API to fetch specific files from the dataset, or some fine-grained git operation.
37
+
38
+ Following this, run validate_data.py against the schema, or use eval_types / instance_level_types for validation with Pydantic, whichever you think is more robust and efficient.
39
+
40
+ (2) Maintain a manifest containing some form of sha256 hashes; compute new hashes for each whole JSON, compare them to the manifest, and if there is a near collision (roughly 99% similar, or identical) write a .txt or .md (whichever is easier) that flags potential duplicates.
41
+
42
+ (3) Write back a text report describing the changes/upload (always with some sort of unique name), stating either that everything was validated or, if something failed, which files failed.
43
+
44
+ The main thing is that all of this needs to run in the Space (as a proxy for CI), update only the necessary data, and then clear the rest of the data from the Space.
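
As a rough end-to-end sketch of the above (not the final implementation): it assumes the huggingface_hub client, a hypothetical PR number, and a simple JSON manifest of known sha256 hashes. Only exact-hash duplicates are flagged here, so the "99% similar" case would still need a normalized-content comparison, and the validation step is left as a TODO hook.

# Hedged sketch of the plan above: find files added in a dataset PR,
# download only those, hash them, and flag exact duplicates against a manifest.
# The PR number, manifest path, and report name are placeholders.
import hashlib
import json
from pathlib import Path

from huggingface_hub import HfApi, hf_hub_download

REPO_ID = "evaleval/EEE_datastore"
PR_REVISION = "refs/pr/42"              # hypothetical PR number
MANIFEST_PATH = Path("manifest.json")   # assumed {file_path: sha256} map of the datastore

api = HfApi()
# Files on main vs. files on the PR ref; the set difference gives newly added files.
# (Modified files would additionally need a commit-level diff or per-file hash check.)
main_files = set(api.list_repo_files(REPO_ID, repo_type="dataset"))
pr_files = set(api.list_repo_files(REPO_ID, repo_type="dataset", revision=PR_REVISION))
added = sorted(pr_files - main_files)

manifest = json.loads(MANIFEST_PATH.read_text()) if MANIFEST_PATH.exists() else {}
known_hashes = set(manifest.values())

report_lines = []
for filename in added:
    # Download only the changed file from the PR revision.
    local_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=filename,
        repo_type="dataset",
        revision=PR_REVISION,
        local_dir="tmp_data",
    )
    digest = hashlib.sha256(Path(local_path).read_bytes()).hexdigest()
    if digest in known_hashes:
        report_lines.append(f"- POTENTIAL DUPLICATE: {filename} (sha256 already in manifest)")
    else:
        report_lines.append(f"- NEW: {filename}")
        manifest[filename] = digest
    # TODO: run validate_data.py / the Pydantic models against local_path here.

Path("pr_report.md").write_text("\n".join(report_lines) + "\n")
MANIFEST_PATH.write_text(json.dumps(manifest, indent=2))
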
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
validate_data.py ADDED
@@ -0,0 +1,190 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ from dataclasses import dataclass, field
5
+ from typing import List
6
+
7
+ from jsonschema.exceptions import ValidationError
8
+ from jsonschema.protocols import Validator
9
+ from jsonschema.validators import validator_for
10
+ from pydantic import ValidationError as PydanticValidationError
11
+
12
+ from eval_types import EvaluationLog
13
+ from instance_level_types import InstanceLevelEvaluationLog
14
+
15
+
16
+ @dataclass
17
+ class FileValidationResult:
18
+ """Result of validating a single file."""
19
+ file_path: str
20
+ valid: bool
21
+ file_type: str # "json" or "jsonl"
22
+ errors: list[str] = field(default_factory=list)
23
+
24
+
25
+ def validate_with_pydantic(file_path: str, file_type: str) -> FileValidationResult:
26
+ """Validate a file using Pydantic models.
27
+
28
+ Args:
29
+ file_path: Path to the file on disk.
30
+ file_type: Either "json" or "jsonl".
31
+
32
+ Returns:
33
+ FileValidationResult with validation outcome and any errors.
34
+ """
35
+ result = FileValidationResult(file_path=file_path, valid=True, file_type=file_type)
36
+
37
+ if file_type == "json":
38
+ try:
39
+ with open(file_path, "r") as f:
40
+ data = json.load(f)
41
+ EvaluationLog(**data)
42
+ except json.JSONDecodeError as e:
43
+ result.valid = False
44
+ result.errors.append(f"JSON parse error: {e}")
45
+ except PydanticValidationError as e:
46
+ result.valid = False
47
+ for err in e.errors():
48
+ loc = " -> ".join(str(part) for part in err["loc"])
49
+ result.errors.append(f"{loc}: {err['msg']}")
50
+ except Exception as e:
51
+ result.valid = False
52
+ result.errors.append(f"{type(e).__name__}: {e}")
53
+
54
+ elif file_type == "jsonl":
55
+ try:
56
+ with open(file_path, "r") as f:
57
+ lines = f.readlines()
58
+ except Exception as e:
59
+ result.valid = False
60
+ result.errors.append(f"File read error: {e}")
61
+ return result
62
+
63
+ for line_num, line in enumerate(lines, start=1):
64
+ line = line.strip()
65
+ if not line:
66
+ continue
67
+ try:
68
+ data = json.loads(line)
69
+ InstanceLevelEvaluationLog(**data)
70
+ except json.JSONDecodeError as e:
71
+ result.valid = False
72
+ result.errors.append(f"Line {line_num}: JSON parse error: {e}")
73
+ except PydanticValidationError as e:
74
+ result.valid = False
75
+ for err in e.errors():
76
+ loc = " -> ".join(str(part) for part in err["loc"])
77
+ result.errors.append(f"Line {line_num}: {loc}: {err['msg']}")
78
+ except Exception as e:
79
+ result.valid = False
80
+ result.errors.append(f"Line {line_num}: {type(e).__name__}: {e}")
81
+ else:
82
+ result.valid = False
83
+ result.errors.append(f"Unsupported file type: {file_type}")
84
+
85
+ return result
86
+
87
+
88
+ def get_schema_validator(file_path: str) -> Validator:
89
+ with open(file_path, "r") as f:
90
+ schema = json.load(f)
91
+ validator_cls = validator_for(schema)
92
+ return validator_cls(schema)
93
+
94
+
95
+ def validate_file(file_path: str, validator: Validator) -> None:
96
+ with open(file_path, "r") as f:
97
+ instance = json.load(f)
98
+ validator.validate(instance)
99
+
100
+
101
+ def expand_paths(paths: List[str]) -> List[str]:
102
+ """Expand folders to file paths"""
103
+ file_paths: List[str] = []
104
+ for path in paths:
105
+ if os.path.isfile(path) and path.endswith(".json"):
106
+ file_paths.append(path)
107
+ elif os.path.isdir(path):
108
+ for root, _, file_names in os.walk(path):
109
+ for file_name in file_names:
110
+ if file_name.endswith(".json"):
111
+ file_paths.append(os.path.join(root, file_name))
112
+ else:
113
+ raise Exception(f"Could not find file or directory at path: {path}")
114
+ return file_paths
115
+
116
+
117
+ def annotate_error(file_path: str, message: str, **kwargs) -> None:
118
+ """If run in GitHub Actions, annotate errors"""
119
+ if os.environ.get("GITHUB_ACTION"):
120
+ joined_kwargs = "".join(f",{key}={value}" for key, value in kwargs.items())
121
+ print(f"::error file={file_path}{joined_kwargs}::{message}")
122
+
123
+
124
+ def main() -> None:
125
+ parser = argparse.ArgumentParser(
126
+ prog="validate_data",
127
+ description="Validates that the JSON data conforms to the JSON schema",
128
+ )
129
+ parser.add_argument(
130
+ "paths", nargs="+", type=str, help="File or folder paths to the JSON data"
131
+ )
132
+ parser.add_argument(
133
+ "-s",
134
+ "--schema-path",
135
+ type=str,
136
+ help="File path to the JSON schema",
137
+ required=True,
138
+ )
139
+ args = parser.parse_args()
140
+ file_paths = expand_paths(args.paths)
141
+ num_passed = 0
142
+ num_failed = 0
143
+ validator = get_schema_validator(args.schema_path)
144
+ print()
145
+ print(f"Validating {len(file_paths)} JSON files...")
146
+ print()
147
+ for file_path in file_paths:
148
+ try:
149
+ validate_file(file_path, validator)
150
+ num_passed += 1
151
+ except ValidationError as e:
152
+ message = f"{type(e).__name__}: {e.message}"
153
+ annotate_error(
154
+ file_path, f"{type(e).__name__}: {e.message}", title=type(e).__name__
155
+ )
156
+ print(f"{file_path}")
157
+ print(" " + message)
158
+ print()
159
+ num_failed += 1
160
+ except json.JSONDecodeError as e:
161
+ # include the line/column info from the decode error in the GitHub annotation below
162
+ message = f"{type(e).__name__}: {str(e)}"
163
+ annotate_error(
164
+ file_path,
165
+ f"{type(e).__name__}: {str(e)}",
166
+ title=type(e).__name__,
167
+ col=e.colno,
168
+ line=e.lineno,
169
+ )
170
+ print(f"{file_path}")
171
+ print(" " + message)
172
+ print()
173
+ num_failed += 1
174
+ except Exception as e:
175
+ message = f"{type(e).__name__}: {str(e)}"
176
+ annotate_error(
177
+ file_path, f"{type(e).__name__}: {str(e)}", title=type(e).__name__
178
+ )
179
+ print(f"{file_path}")
180
+ print(" " + message)
181
+ print()
182
+ raise
183
+ print(f"{num_passed} file(s) passed; {num_failed} file(s) failed")
184
+ print()
185
+ if num_failed > 0:
186
+ exit(1)
187
+
188
+
189
+ if __name__ == "__main__":
190
+ main()
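
For reference, a minimal invocation of this script, with purely illustrative paths, based on the argparse interface above:

python validate_data.py path/to/data --schema-path path/to/schema.json

It prints per-file errors, emits ::error annotations when it detects a GitHub Actions environment (via the GITHUB_ACTION variable), and exits with status 1 if any file fails, so it can gate either a CI job or the Space-based proxy described in requirement_plan.txt.
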