{ "$schema": "http://json-schema.org/draft-07/schema#", "version": "0.2.0", "type": "object", "description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics", "required": [ "schema_version", "evaluation_id", "retrieved_timestamp", "source_metadata", "model_info", "evaluation_results" ], "additionalProperties": false, "properties": { "schema_version": { "type": "string", "description": "Version of the schema used for this evaluation data" }, "evaluation_id": { "type": "string", "description": "Unique identifier for this specific evaluation run. Use eval_name/model_id/retrieved_timestamp format" }, "evaluation_timestamp": { "type": "string", "description": "Timestamp for when the evaluation was run" }, "retrieved_timestamp": { "type": "string", "description": "Timestamp for when this record was created - using Unix Epoch time format" }, "source_metadata": { "type": "object", "description": "Metadata about the source of the leaderboard data", "required": [ "source_type", "source_organization_name", "evaluator_relationship" ], "properties": { "source_name": { "type": "string", "description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation)." }, "source_type": { "type": "string", "enum": [ "documentation", "evaluation_run" ], "description": "Whether the data comes from a direct evaluation run or from documentation" }, "source_organization_name": { "type": "string", "description": "Name of the organization that provides the data" }, "source_organization_url": { "type": "string", "description": "URL for the organization that provides the data" }, "source_organization_logo_url": { "type": "string", "description": "URL for the Logo for the organization that provides the data" }, "evaluator_relationship": { "type": "string", "description": "Relationship between the evaluator and the model", "enum": [ "first_party", "third_party", "collaborative", "other" ] } } }, "model_info": { "$ref": "#/$defs/model_info" }, "evaluation_results": { "type": "array", "description": "Array of evaluation results", "items": { "type": "object", "required": [ "evaluation_name", "source_data", "metric_config", "score_details" ], "properties": { "evaluation_name": { "type": "string", "description": "Name of the evaluation" }, "source_data": { "description": "Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.", "oneOf": [ { "$ref": "#/$defs/source_data_url" }, { "$ref": "#/$defs/source_data_hf" }, { "$ref": "#/$defs/source_data_private" } ] }, "evaluation_timestamp": { "type": "string", "description": "Timestamp for when the evaluations were run" }, "metric_config": { "type": "object", "description": "Details about the metric", "required": [ "lower_is_better" ], "properties": { "evaluation_description": { "type": "string", "description": "Description of the evaluation" }, "lower_is_better": { "type": "boolean", "description": "Whether a lower score is better" }, "score_type": { "type": "string", "description": "Type of score", "enum": [ "binary", "continuous", "levels" ] }, "level_names": { "type": "array", "description": "Names of the score levels", "items": { "type": "string" } }, "level_metadata": { "type": "array", "description": "Additional Description for each Score Level", "items": { "type": "string" } }, "has_unknown_level": { "type": "boolean", "description": "Indicates whether there is an Unknown Level - if True, then a score 
of -1 will be treated as Unknown" }, "min_score": { "type": "number", "description": "Minimum possible score for continuous metric" }, "max_score": { "type": "number", "description": "Maximum possible score for continuous metric" }, "llm_scoring": { "type": "object", "description": "Configuration when LLM is used as scorer/judge", "additionalProperties": true, "required": [ "judges", "input_prompt" ], "properties": { "judges": { "type": "array", "description": "LLM judge(s) - single item for judge, multiple for jury", "items": { "$ref": "#/$defs/judge_config" }, "minItems": 1 }, "input_prompt": { "type": "string", "description": "Prompt template used for judging" }, "aggregation_method": { "type": "string", "enum": [ "majority_vote", "average", "weighted_average", "median" ], "description": "How to aggregate scores when multiple judges" }, "expert_baseline": { "type": "number", "description": "Expert/human baseline score for comparison" }, "additional_details": { "$ref": "#/$defs/additional_properties_object" } } } }, "if": { "properties": { "score_type": { "const": "levels" } } }, "then": { "required": [ "level_names", "has_unknown_level" ] }, "else": { "if": { "properties": { "score_type": { "const": "continuous" } } }, "then": { "required": [ "min_score", "max_score" ] } } }, "score_details": { "type" : "object", "description": "The score for the evaluation and related details", "required": [ "score" ], "properties": { "score": { "type": "number", "description": "The score for the evaluation" }, "details": { "$ref": "#/$defs/additional_properties_object" }, "uncertainty": { "type": "object", "description": "Quantification of uncertainty around the reported score", "properties": { "standard_error": { "type": "object", "description": "Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))", "properties": { "value": { "type": "number", "description": "The standard error value" }, "method": { "type": "string", "description": "How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')" } }, "required": ["value"] }, "confidence_interval": { "type": "object", "description": "Lower and upper bounds for the metric at a given confidence level.", "properties": { "lower": { "type": "number", "description": "Lower bound of the confidence interval" }, "upper": { "type": "number", "description": "Upper bound of the confidence interval" }, "confidence_level": { "type": "number", "description": "Confidence level (e.g. 
0.95 for a 95% confidence interval)", "minimum": 0, "maximum": 1 }, "method": { "type": "string", "description": "How the confidence interval was computed" } }, "required": ["lower", "upper"] }, "standard_deviation": { "type": "number", "description": "Standard deviation of the per-sample scores" }, "num_samples": { "type": "integer", "description": "Number of samples used to compute the uncertainty estimates" }, "num_bootstrap_samples": { "type": "integer", "description": "Number of bootstrap resamples used, if bootstrap method was applied" } } } } }, "generation_config": { "type": "object", "properties": { "generation_args": { "type": "object", "description": "Parameters used to generate results - properties may vary by model type", "properties": { "temperature": { "type": [ "null", "number" ], "description": "Sampling temperature" }, "top_p": { "type": [ "null", "number" ], "description": "Nucleus sampling parameter" }, "top_k": { "type": [ "null", "number" ], "description": "Top-k sampling parameter" }, "max_tokens": { "type": "integer", "minimum": 1, "description": "Maximum number of tokens to generate" }, "execution_command": { "type": "string", "description": "Command used to run the model to generate results" }, "reasoning": { "type": "boolean", "description": "Whether reasoning orchain-of-thought was used to generate results" }, "prompt_template": { "type": "string", "description": "Input prompt template for task (should contain agentic info if needed)." }, "agentic_eval_config": { "type": "object", "description": "General configuration for agentic evaluations.", "properties": { "available_tools": { "type": "array", "description": "List of all available tools with their configurations", "items": { "type": "object", "properties": { "name": { "type": "string", "description": "e.g. bash, calculator, ..." }, "description": { "type": "string" }, "parameters": { "$ref": "#/$defs/additional_properties_object" } } } }, "additional_details": { "$ref": "#/$defs/additional_properties_object" } } }, "eval_plan": { "type": "object", "description": "Plan (solvers) used in evaluation. Solvers are crucial parts of Inspect evaluations which can serve a wide variety of purposes like providing system prompts, prompt engineering, model generation or multi-turn dialog.", "properties": { "name": { "type": "string" }, "steps": { "type": "array", "description": "Array of evaluation plan steps", "items": { "solver": { "type": "string", "description": "Name of solver e.g. system_message, react." }, "parameters": { "$ref": "#/$defs/additional_properties_object" } } }, "config": { "$ref": "#/$defs/additional_properties_object" } } }, "eval_limits": { "type": "object", "description": "Listed evaluation limits like time limit, message limit, token limit.", "properties": { "time_limit": { "type": "integer", "description": "Time limit for evaluation." }, "message_limit": { "type": "integer", "description": "Message limit for evaluation." }, "token_limit": { "type": "integer", "description": "Token limit for evaluation." } } }, "sandbox": { "type": "object", "properties": { "type": { "type": "string", "description": "Type of sandbox e.g. docker" }, "config": { "type": "string", "description": "Config file name/path e.g. compose.yaml. TODO or full config? 
Not sure based on the Inspect docs" } } }, "max_attempts": { "type": "integer", "description": "Maximum number of submission attempts (default 1).", "default": 1 }, "incorrect_attempt_feedback": { "type": "string", "description": "Feedback from the model after incorrect attempt." } }, "additionalProperties": true }, "additional_details": { "$ref": "#/$defs/additional_properties_object" } } } } } }, "detailed_evaluation_results": { "description": "Reference to the evaluation results for all individual samples in the evaluation", "properties": { "format": { "type": "string", "description": "Format of the detailed evaluation results", "enum": [ "jsonl", "json" ] }, "file_path": { "type": "string", "description": "Path to the detailed evaluation results file" }, "hash_algorithm": { "type": "string", "description": "Hash algorithm used for checksum and sample_hash in instance-level data", "enum": [ "sha256", "md5" ] }, "checksum": { "type": "string", "description": "Checksum value of the file" }, "total_rows": { "type": "integer", "description": "Total number of rows in the detailed evaluation results file" } } } }, "$defs": { "additional_properties_object": { "type": "object", "description": "Additional parameters (key-value object)", "additionalProperties": true }, "judge_config": { "type": "object", "description": "Configuration for a single LLM judge/juror", "required": [ "model_info" ], "properties": { "model_info": { "$ref": "#/$defs/model_info" }, "temperature": { "type": "number" }, "weight": { "type": "number", "description": "Weight of this judge's score in aggregation (used in jury)" } } }, "model_info": { "type": "object", "description": "Complete model specification including basic information, technical configuration and inference settings", "required": [ "name", "id" ], "properties": { "name": { "type": "string", "description": "Model name provided by evaluation source" }, "id": { "type": "string", "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)" }, "developer": { "type": "string", "description": "Name of organization that provides the model (e.g. 'OpenAI')" }, "inference_platform": { "type": "string", "description": "Name of inference platform which provides an access to models by API to run the evaluations or provides models weights to run them locally (e.g. HuggingFace, Bedrock, Together AI)" }, "inference_engine": { "type": "object", "description": "Name of inference engine which provides an access to optimized models to use them for local evaluations (e.g. 
vLLM, Ollama).", "properties": { "name": { "type": "string", "description": "Name of the inference engine" }, "version": { "type": "string", "description": "Version of the inference engine" } } }, "additional_details": { "$ref": "#/$defs/additional_properties_object" } } }, "source_data_url": { "type": "object", "description": "URL source for the evaluation data", "required": [ "dataset_name", "source_type", "url" ], "additionalProperties": true, "properties": { "dataset_name": { "type": "string", "description": "Name of the source dataset" }, "source_type": { "const": "url" }, "url": { "type": "array", "items": { "type": "string" }, "minItems": 1, "description": "URL(s) for the source of the evaluation data" }, "additional_details": { "$ref": "#/$defs/additional_properties_object" } } }, "source_data_hf": { "type": "object", "description": "Details about HuggingFace dataset used for evaluation", "required": [ "dataset_name", "source_type" ], "additionalProperties": true, "properties": { "dataset_name": { "type": "string", "description": "Name of the source dataset" }, "source_type": { "const": "hf_dataset" }, "hf_repo": { "type": "string", "description": "HuggingFace repository identifier" }, "hf_split": { "type": "string", "description": "One of train, val or test." }, "samples_number": { "type": "integer", "description": "Number of samples in the dataset" }, "sample_ids": { "type": "array", "description": "Array of sample ids used for evaluation", "items": { "type": [ "integer", "string" ] } }, "additional_details": { "$ref": "#/$defs/additional_properties_object" } } }, "source_data_private": { "type": "object", "description": "Generic source data when neither URL array nor HuggingFace dataset applies", "required": [ "dataset_name", "source_type" ], "properties": { "dataset_name": { "type": "string", "description": "Name of the source dataset" }, "source_type": { "const": "other" }, "additional_details": { "$ref": "#/$defs/additional_properties_object" } } } } }