{
"$schema": "http://json-schema.org/draft-07/schema#",
"version": "0.2.0",
"type": "object",
"description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics",
"required": [
"schema_version",
"evaluation_id",
"retrieved_timestamp",
"source_metadata",
"model_info",
"evaluation_results"
],
"additionalProperties": false,
"properties": {
"schema_version": {
"type": "string",
"description": "Version of the schema used for this evaluation data"
},
"evaluation_id": {
"type": "string",
"description": "Unique identifier for this specific evaluation run. Use eval_name/model_id/retrieved_timestamp format"
},
"evaluation_timestamp": {
"type": "string",
"description": "Timestamp for when the evaluation was run"
},
"retrieved_timestamp": {
"type": "string",
"description": "Timestamp for when this record was created - using Unix Epoch time format"
},
"source_metadata": {
"type": "object",
"description": "Metadata about the source of the leaderboard data",
"required": [
"source_type",
"source_organization_name",
"evaluator_relationship"
],
"properties": {
"source_name": {
"type": "string",
"description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation)."
},
"source_type": {
"type": "string",
"enum": [
"documentation",
"evaluation_run"
],
"description": "Whether the data comes from a direct evaluation run or from documentation"
},
"source_organization_name": {
"type": "string",
"description": "Name of the organization that provides the data"
},
"source_organization_url": {
"type": "string",
"description": "URL for the organization that provides the data"
},
"source_organization_logo_url": {
"type": "string",
"description": "URL for the Logo for the organization that provides the data"
},
"evaluator_relationship": {
"type": "string",
"description": "Relationship between the evaluator and the model",
"enum": [
"first_party",
"third_party",
"collaborative",
"other"
]
}
}
},
"model_info": {
"$ref": "#/$defs/model_info"
},
"evaluation_results": {
"type": "array",
"description": "Array of evaluation results",
"items": {
"type": "object",
"required": [
"evaluation_name",
"source_data",
"metric_config",
"score_details"
],
"properties": {
"evaluation_name": {
"type": "string",
"description": "Name of the evaluation"
},
"source_data": {
"description": "Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.",
"oneOf": [
{
"$ref": "#/$defs/source_data_url"
},
{
"$ref": "#/$defs/source_data_hf"
},
{
"$ref": "#/$defs/source_data_private"
}
]
},
"evaluation_timestamp": {
"type": "string",
"description": "Timestamp for when the evaluations were run"
},
"metric_config": {
"type": "object",
"description": "Details about the metric",
"required": [
"lower_is_better"
],
"properties": {
"evaluation_description": {
"type": "string",
"description": "Description of the evaluation"
},
"lower_is_better": {
"type": "boolean",
"description": "Whether a lower score is better"
},
"score_type": {
"type": "string",
"description": "Type of score",
"enum": [
"binary",
"continuous",
"levels"
]
},
"level_names": {
"type": "array",
"description": "Names of the score levels",
"items": {
"type": "string"
}
},
"level_metadata": {
"type": "array",
"description": "Additional Description for each Score Level",
"items": {
"type": "string"
}
},
"has_unknown_level": {
"type": "boolean",
"description": "Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown"
},
"min_score": {
"type": "number",
"description": "Minimum possible score for continuous metric"
},
"max_score": {
"type": "number",
"description": "Maximum possible score for continuous metric"
},
"llm_scoring": {
"type": "object",
"description": "Configuration when LLM is used as scorer/judge",
"additionalProperties": true,
"required": [
"judges",
"input_prompt"
],
"properties": {
"judges": {
"type": "array",
"description": "LLM judge(s) - single item for judge, multiple for jury",
"items": {
"$ref": "#/$defs/judge_config"
},
"minItems": 1
},
"input_prompt": {
"type": "string",
"description": "Prompt template used for judging"
},
"aggregation_method": {
"type": "string",
"enum": [
"majority_vote",
"average",
"weighted_average",
"median"
],
"description": "How to aggregate scores when multiple judges"
},
"expert_baseline": {
"type": "number",
"description": "Expert/human baseline score for comparison"
},
"additional_details": {
"$ref": "#/$defs/additional_properties_object"
}
}
}
},
"if": {
"properties": {
"score_type": {
"const": "levels"
}
}
},
"then": {
"required": [
"level_names",
"has_unknown_level"
]
},
"else": {
"if": {
"properties": {
"score_type": {
"const": "continuous"
}
}
},
"then": {
"required": [
"min_score",
"max_score"
]
}
}
},
"score_details": {
"type" : "object",
"description": "The score for the evaluation and related details",
"required": [
"score"
],
"properties": {
"score": {
"type": "number",
"description": "The score for the evaluation"
},
"details": {
"$ref": "#/$defs/additional_properties_object"
},
"uncertainty": {
"type": "object",
"description": "Quantification of uncertainty around the reported score",
"properties": {
"standard_error": {
"type": "object",
"description": "Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))",
"properties": {
"value": {
"type": "number",
"description": "The standard error value"
},
"method": {
"type": "string",
"description": "How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')"
}
},
"required": ["value"]
},
"confidence_interval": {
"type": "object",
"description": "Lower and upper bounds for the metric at a given confidence level.",
"properties": {
"lower": {
"type": "number",
"description": "Lower bound of the confidence interval"
},
"upper": {
"type": "number",
"description": "Upper bound of the confidence interval"
},
"confidence_level": {
"type": "number",
"description": "Confidence level (e.g. 0.95 for a 95% confidence interval)",
"minimum": 0,
"maximum": 1
},
"method": {
"type": "string",
"description": "How the confidence interval was computed"
}
},
"required": ["lower", "upper"]
},
"standard_deviation": {
"type": "number",
"description": "Standard deviation of the per-sample scores"
},
"num_samples": {
"type": "integer",
"description": "Number of samples used to compute the uncertainty estimates"
},
"num_bootstrap_samples": {
"type": "integer",
"description": "Number of bootstrap resamples used, if bootstrap method was applied"
}
}
}
}
},
"generation_config": {
"type": "object",
"properties": {
"generation_args": {
"type": "object",
"description": "Parameters used to generate results - properties may vary by model type",
"properties": {
"temperature": {
"type": [
"null",
"number"
],
"description": "Sampling temperature"
},
"top_p": {
"type": [
"null",
"number"
],
"description": "Nucleus sampling parameter"
},
"top_k": {
"type": [
"null",
"number"
],
"description": "Top-k sampling parameter"
},
"max_tokens": {
"type": "integer",
"minimum": 1,
"description": "Maximum number of tokens to generate"
},
"execution_command": {
"type": "string",
"description": "Command used to run the model to generate results"
},
"reasoning": {
"type": "boolean",
"description": "Whether reasoning orchain-of-thought was used to generate results"
},
"prompt_template": {
"type": "string",
"description": "Input prompt template for task (should contain agentic info if needed)."
},
"agentic_eval_config": {
"type": "object",
"description": "General configuration for agentic evaluations.",
"properties": {
"available_tools": {
"type": "array",
"description": "List of all available tools with their configurations",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "e.g. bash, calculator, ..."
},
"description": {
"type": "string"
},
"parameters": {
"$ref": "#/$defs/additional_properties_object"
}
}
}
},
"additional_details": {
"$ref": "#/$defs/additional_properties_object"
}
}
},
"eval_plan": {
"type": "object",
"description": "Plan (solvers) used in evaluation. Solvers are crucial parts of Inspect evaluations which can serve a wide variety of purposes like providing system prompts, prompt engineering, model generation or multi-turn dialog.",
"properties": {
"name": {
"type": "string"
},
"steps": {
"type": "array",
"description": "Array of evaluation plan steps",
"items": {
"solver": {
"type": "string",
"description": "Name of solver e.g. system_message, react."
},
"parameters": {
"$ref": "#/$defs/additional_properties_object"
}
}
},
"config": {
"$ref": "#/$defs/additional_properties_object"
}
}
},
"eval_limits": {
"type": "object",
"description": "Listed evaluation limits like time limit, message limit, token limit.",
"properties": {
"time_limit": {
"type": "integer",
"description": "Time limit for evaluation."
},
"message_limit": {
"type": "integer",
"description": "Message limit for evaluation."
},
"token_limit": {
"type": "integer",
"description": "Token limit for evaluation."
}
}
},
"sandbox": {
"type": "object",
"properties": {
"type": {
"type": "string",
"description": "Type of sandbox e.g. docker"
},
"config": {
"type": "string",
"description": "Config file name/path e.g. compose.yaml. TODO or full config? Not sure based on the Inspect docs"
}
}
},
"max_attempts": {
"type": "integer",
"description": "Maximum number of submission attempts (default 1).",
"default": 1
},
"incorrect_attempt_feedback": {
"type": "string",
"description": "Feedback from the model after incorrect attempt."
}
},
"additionalProperties": true
},
"additional_details": {
"$ref": "#/$defs/additional_properties_object"
}
}
}
}
}
},
"detailed_evaluation_results": {
"description": "Reference to the evaluation results for all individual samples in the evaluation",
"properties": {
"format": {
"type": "string",
"description": "Format of the detailed evaluation results",
"enum": [
"jsonl",
"json"
]
},
"file_path": {
"type": "string",
"description": "Path to the detailed evaluation results file"
},
"hash_algorithm": {
"type": "string",
"description": "Hash algorithm used for checksum and sample_hash in instance-level data",
"enum": [
"sha256",
"md5"
]
},
"checksum": {
"type": "string",
"description": "Checksum value of the file"
},
"total_rows": {
"type": "integer",
"description": "Total number of rows in the detailed evaluation results file"
}
}
}
},
"$defs": {
"additional_properties_object": {
"type": "object",
"description": "Additional parameters (key-value object)",
"additionalProperties": true
},
"judge_config": {
"type": "object",
"description": "Configuration for a single LLM judge/juror",
"required": [
"model_info"
],
"properties": {
"model_info": {
"$ref": "#/$defs/model_info"
},
"temperature": {
"type": "number"
},
"weight": {
"type": "number",
"description": "Weight of this judge's score in aggregation (used in jury)"
}
}
},
"model_info": {
"type": "object",
"description": "Complete model specification including basic information, technical configuration and inference settings",
"required": [
"name",
"id"
],
"properties": {
"name": {
"type": "string",
"description": "Model name provided by evaluation source"
},
"id": {
"type": "string",
"description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)"
},
"developer": {
"type": "string",
"description": "Name of organization that provides the model (e.g. 'OpenAI')"
},
"inference_platform": {
"type": "string",
"description": "Name of inference platform which provides an access to models by API to run the evaluations or provides models weights to run them locally (e.g. HuggingFace, Bedrock, Together AI)"
},
"inference_engine": {
"type": "object",
"description": "Name of inference engine which provides an access to optimized models to use them for local evaluations (e.g. vLLM, Ollama).",
"properties": {
"name": {
"type": "string",
"description": "Name of the inference engine"
},
"version": {
"type": "string",
"description": "Version of the inference engine"
}
}
},
"additional_details": {
"$ref": "#/$defs/additional_properties_object"
}
}
},
"source_data_url": {
"type": "object",
"description": "URL source for the evaluation data",
"required": [
"dataset_name",
"source_type",
"url"
],
"additionalProperties": true,
"properties": {
"dataset_name": {
"type": "string",
"description": "Name of the source dataset"
},
"source_type": {
"const": "url"
},
"url": {
"type": "array",
"items": {
"type": "string"
},
"minItems": 1,
"description": "URL(s) for the source of the evaluation data"
},
"additional_details": {
"$ref": "#/$defs/additional_properties_object"
}
}
},
"source_data_hf": {
"type": "object",
"description": "Details about HuggingFace dataset used for evaluation",
"required": [
"dataset_name",
"source_type"
],
"additionalProperties": true,
"properties": {
"dataset_name": {
"type": "string",
"description": "Name of the source dataset"
},
"source_type": {
"const": "hf_dataset"
},
"hf_repo": {
"type": "string",
"description": "HuggingFace repository identifier"
},
"hf_split": {
"type": "string",
"description": "One of train, val or test."
},
"samples_number": {
"type": "integer",
"description": "Number of samples in the dataset"
},
"sample_ids": {
"type": "array",
"description": "Array of sample ids used for evaluation",
"items": {
"type": [
"integer",
"string"
]
}
},
"additional_details": {
"$ref": "#/$defs/additional_properties_object"
}
}
},
"source_data_private": {
"type": "object",
"description": "Generic source data when neither URL array nor HuggingFace dataset applies",
"required": [
"dataset_name",
"source_type"
],
"properties": {
"dataset_name": {
"type": "string",
"description": "Name of the source dataset"
},
"source_type": {
"const": "other"
},
"additional_details": {
"$ref": "#/$defs/additional_properties_object"
}
}
}
}
}
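
For reference, below is a minimal sketch of an instance that would validate against the schema above. All values are illustrative placeholders (model, organization, dataset, timestamps, and scores are assumptions, not real results). The sketch includes "score_type": "continuous" together with "min_score" and "max_score" so that the continuous branch of the metric_config conditional is satisfied, and its "source_data" uses the hf_dataset variant of the oneOf.

{
  "schema_version": "0.2.0",
  "evaluation_id": "example_eval/example-org/example-model-8b-instruct/1719316800",
  "retrieved_timestamp": "1719316800",
  "source_metadata": {
    "source_name": "Example Leaderboard",
    "source_type": "evaluation_run",
    "source_organization_name": "Example Org",
    "evaluator_relationship": "third_party"
  },
  "model_info": {
    "name": "Example Model 8B Instruct",
    "id": "example-org/example-model-8b-instruct",
    "developer": "Example Org"
  },
  "evaluation_results": [
    {
      "evaluation_name": "example_eval",
      "source_data": {
        "dataset_name": "example_dataset",
        "source_type": "hf_dataset",
        "hf_repo": "example-org/example-dataset",
        "hf_split": "test"
      },
      "metric_config": {
        "lower_is_better": false,
        "score_type": "continuous",
        "min_score": 0,
        "max_score": 1
      },
      "score_details": {
        "score": 0.68
      }
    }
  ]
}

Such an instance can be checked against the schema with any draft-07 validator; the root object permits only the declared properties ("additionalProperties": false), so extra top-level keys would be rejected.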