import argparse
import json
import os
from dataclasses import dataclass, field
from typing import List

from jsonschema.exceptions import ValidationError
from jsonschema.protocols import Validator
from jsonschema.validators import validator_for
from pydantic import ValidationError as PydanticValidationError

from eval_types import EvaluationLog
from instance_level_types import InstanceLevelEvaluationLog


@dataclass
class FileValidationResult:
    """Result of validating a single file."""

    file_path: str
    valid: bool
    file_type: str  # "json" or "jsonl"
    errors: list[str] = field(default_factory=list)


def validate_with_pydantic(file_path: str, file_type: str) -> FileValidationResult:
    """Validate a file using Pydantic models.

    Args:
        file_path: Path to the file on disk.
        file_type: Either "json" or "jsonl".

    Returns:
        FileValidationResult with validation outcome and any errors.
    """
    result = FileValidationResult(file_path=file_path, valid=True, file_type=file_type)

    if file_type == "json":
        # A .json file holds a single aggregate EvaluationLog object.
        try:
            with open(file_path, "r") as f:
                data = json.load(f)
            EvaluationLog(**data)
        except json.JSONDecodeError as e:
            result.valid = False
            result.errors.append(f"JSON parse error: {e}")
        except PydanticValidationError as e:
            result.valid = False
            for err in e.errors():
                loc = " -> ".join(str(l) for l in err["loc"])
                result.errors.append(f"{loc}: {err['msg']}")
        except Exception as e:
            result.valid = False
            result.errors.append(f"{type(e).__name__}: {e}")
    elif file_type == "jsonl":
        # A .jsonl file holds one InstanceLevelEvaluationLog object per line.
        try:
            with open(file_path, "r") as f:
                lines = f.readlines()
        except Exception as e:
            result.valid = False
            result.errors.append(f"File read error: {e}")
            return result

        for line_num, line in enumerate(lines, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
                InstanceLevelEvaluationLog(**data)
            except json.JSONDecodeError as e:
                result.valid = False
                result.errors.append(f"Line {line_num}: JSON parse error: {e}")
            except PydanticValidationError as e:
                result.valid = False
                for err in e.errors():
                    loc = " -> ".join(str(l) for l in err["loc"])
                    result.errors.append(f"Line {line_num}: {loc}: {err['msg']}")
            except Exception as e:
                result.valid = False
                result.errors.append(f"Line {line_num}: {type(e).__name__}: {e}")
    else:
        result.valid = False
        result.errors.append(f"Unsupported file type: {file_type}")

    return result
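

# A minimal usage sketch (the file path below is hypothetical): validate one
# aggregate .json log with the Pydantic models and print any collected errors.
#
#     result = validate_with_pydantic("results/eval_log.json", "json")
#     if not result.valid:
#         for error in result.errors:
#             print(f"{result.file_path}: {error}")
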
def get_schema_validator(file_path: str) -> Validator:
    """Load a JSON schema from disk and return a validator instance for it."""
    with open(file_path, "r") as f:
        schema = json.load(f)
    validator_cls = validator_for(schema)
    return validator_cls(schema)


def validate_file(file_path: str, validator: Validator) -> None:
    """Validate a single JSON file against the schema; raises ValidationError on failure."""
    with open(file_path, "r") as f:
        instance = json.load(f)
    validator.validate(instance)


def expand_paths(paths: List[str]) -> List[str]:
    """Expand folders to the .json file paths they contain."""
    file_paths: List[str] = []
    for path in paths:
        if os.path.isfile(path) and path.endswith(".json"):
            file_paths.append(path)
        elif os.path.isdir(path):
            for root, _, file_names in os.walk(path):
                for file_name in file_names:
                    if file_name.endswith(".json"):
                        file_paths.append(os.path.join(root, file_name))
        else:
            raise Exception(f"Could not find file or directory at path: {path}")
    return file_paths


def annotate_error(file_path: str, message: str, **kwargs) -> None:
    """If run in GitHub Actions, annotate errors."""
    if os.environ.get("GITHUB_ACTION"):
        joined_kwargs = "".join(f",{key}={value}" for key, value in kwargs.items())
        print(f"::error file={file_path}{joined_kwargs}::{message}")
def main() -> None:
    parser = argparse.ArgumentParser(
        prog="validate_data",
        description="Validates that the JSON data conforms to the JSON schema",
    )
    parser.add_argument(
        "paths", nargs="+", type=str, help="File or folder paths to the JSON data"
    )
    parser.add_argument(
        "-s",
        "--schema-path",
        type=str,
        help="File path to the JSON schema",
        required=True,
    )
    args = parser.parse_args()

    file_paths = expand_paths(args.paths)
    num_passed = 0
    num_failed = 0
    validator = get_schema_validator(args.schema_path)

    print()
    print(f"Validating {len(file_paths)} JSON files...")
    print()

    for file_path in file_paths:
        try:
            validate_file(file_path, validator)
            num_passed += 1
        except ValidationError as e:
            message = f"{type(e).__name__}: {e.message}"
            annotate_error(file_path, message, title=type(e).__name__)
            print(f"{file_path}")
            print(" " + message)
            print()
            num_failed += 1
        except json.JSONDecodeError as e:
            message = f"{type(e).__name__}: {e}"
            annotate_error(
                file_path,
                message,
                title=type(e).__name__,
                col=e.colno,
                line=e.lineno,
            )
            print(f"{file_path}")
            print(" " + message)
            print()
            num_failed += 1
        except Exception as e:
            message = f"{type(e).__name__}: {e}"
            annotate_error(file_path, message, title=type(e).__name__)
            print(f"{file_path}")
            print(" " + message)
            print()
            raise

    print(f"{num_passed} file(s) passed; {num_failed} file(s) failed")
    print()

    if num_failed > 0:
        exit(1)
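

# Example invocation (the script, data, and schema paths below are hypothetical):
#
#     python validate_data.py data/results/ --schema-path schemas/eval_schema.json
#
# The process exits with status 1 if any file fails validation, so it can gate a CI job.
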
if __name__ == "__main__":
    main()