{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "0.2.0",
  "type": "object",
  "description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, outputs, and evaluation metrics",
  "required": [
    "schema_version",
    "evaluation_id",
    "retrieved_timestamp",
    "source_metadata",
    "model_info",
    "evaluation_results"
  ],
  "additionalProperties": false,
  "properties": {
    "schema_version": {
      "type": "string",
      "description": "Version of the schema used for this evaluation data"
    },
    "evaluation_id": {
      "type": "string",
      "description": "Unique identifier for this specific evaluation run. Use eval_name/model_id/retrieved_timestamp format"
    },
    "evaluation_timestamp": {
      "type": "string",
      "description": "Timestamp for when the evaluation was run"
    },
    "retrieved_timestamp": {
      "type": "string",
      "description": "Timestamp for when this record was created - using Unix Epoch time format"
    },
    "source_metadata": {
      "type": "object",
      "description": "Metadata about the source of the leaderboard data",
      "required": [
        "source_type",
        "source_organization_name",
        "evaluator_relationship"
      ],
      "properties": {
        "source_name": {
          "type": "string",
          "description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation)."
        },
        "source_type": {
          "type": "string",
          "enum": [
            "documentation",
            "evaluation_run"
          ],
          "description": "Whether the data comes from a direct evaluation run or from documentation"
        },
        "source_organization_name": {
          "type": "string",
          "description": "Name of the organization that provides the data"
        },
        "source_organization_url": {
          "type": "string",
          "description": "URL for the organization that provides the data"
        },
        "source_organization_logo_url": {
          "type": "string",
          "description": "URL for the logo for the organization that provides the data"
        },
        "evaluator_relationship": {
          "type": "string",
          "description": "Relationship between the evaluator and the model",
          "enum": [
            "first_party",
            "third_party",
            "collaborative",
            "other"
          ]
        }
      }
    },
    "model_info": {
      "$ref": "#/$defs/model_info"
    },
    "evaluation_results": {
      "type": "array",
      "description": "Array of evaluation results",
      "items": {
        "type": "object",
        "required": [
          "evaluation_name",
          "source_data",
          "metric_config",
          "score_details"
        ],
        "properties": {
          "evaluation_name": {
            "type": "string",
            "description": "Name of the evaluation"
          },
          "source_data": {
            "description": "Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.",
            "oneOf": [
              {
                "$ref": "#/$defs/source_data_url"
              },
              {
                "$ref": "#/$defs/source_data_hf"
              },
              {
                "$ref": "#/$defs/source_data_private"
              }
            ]
          },
          "evaluation_timestamp": {
            "type": "string",
            "description": "Timestamp for when the evaluations were run"
          },
          "metric_config": {
            "type": "object",
            "description": "Details about the metric",
            "required": [
              "lower_is_better"
            ],
            "properties": {
              "evaluation_description": {
                "type": "string",
                "description": "Description of the evaluation"
              },
              "lower_is_better": {
                "type": "boolean",
                "description": "Whether a lower score is better"
              },
              "score_type": {
                "type": "string",
                "description": "Type of score",
                "enum": [
                  "binary",
                  "continuous",
                  "levels"
                ]
              },
              "level_names": {
                "type": "array",
                "description": "Names of the score levels",
                "items": {
                  "type": "string"
                }
              },
              "level_metadata": {
                "type": "array",
                "description": "Additional Description for each Score Level",
                "items": {
                  "type": "string"
                }
              },
              "has_unknown_level": {
                "type": "boolean",
                "description": "Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown"
              },
              "min_score": {
                "type": "number",
                "description": "Minimum possible score for continuous metric"
              },
              "max_score": {
                "type": "number",
                "description": "Maximum possible score for continuous metric"
              },
              "llm_scoring": {
                "type": "object",
                "description": "Configuration when LLM is used as scorer/judge",
                "additionalProperties": true,
                "required": [
                  "judges",
                  "input_prompt"
                ],
                "properties": {
                  "judges": {
                    "type": "array",
                    "description": "LLM judge(s) - single item for judge, multiple for jury",
                    "items": {
                      "$ref": "#/$defs/judge_config"
                    },
                    "minItems": 1
                  },
                  "input_prompt": {
                    "type": "string",
                    "description": "Prompt template used for judging"
                  },
                  "aggregation_method": {
                    "type": "string",
                    "enum": [
                      "majority_vote",
                      "average",
                      "weighted_average",
                      "median"
                    ],
                    "description": "How to aggregate scores when multiple judges"
                  },
                  "expert_baseline": {
                    "type": "number",
                    "description": "Expert/human baseline score for comparison"
                  },
                  "additional_details": {
                    "$ref": "#/$defs/additional_properties_object"
                  }
                }
              }
            },
            "if": {
              "required": [
                "score_type"
              ],
              "properties": {
                "score_type": {
                  "const": "levels"
                }
              }
            },
            "then": {
              "required": [
                "level_names",
                "has_unknown_level"
              ]
            },
            "else": {
              "if": {
                "required": [
                  "score_type"
                ],
                "properties": {
                  "score_type": {
                    "const": "continuous"
                  }
                }
              },
              "then": {
                "required": [
                  "min_score",
                  "max_score"
                ]
              }
            }
          },
          "score_details": {
            "type": "object",
            "description": "The score for the evaluation and related details",
            "required": [
              "score"
            ],
            "properties": {
              "score": {
                "type": "number",
                "description": "The score for the evaluation"
              },
              "details": {
                "$ref": "#/$defs/additional_properties_object"
              },
              "uncertainty": {
                "type": "object",
                "description": "Quantification of uncertainty around the reported score",
                "properties": {
                  "standard_error": {
                    "type": "object",
                    "description": "Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))",
                    "properties": {
                      "value": {
                        "type": "number",
                        "description": "The standard error value"
                      },
                      "method": {
                        "type": "string",
                        "description": "How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')"
                      }
                    },
                    "required": ["value"]
                  },
                  "confidence_interval": {
                    "type": "object",
                    "description": "Lower and upper bounds for the metric at a given confidence level.",
                    "properties": {
                      "lower": {
                        "type": "number",
                        "description": "Lower bound of the confidence interval"
                      },
                      "upper": {
                        "type": "number",
                        "description": "Upper bound of the confidence interval"
                      },
                      "confidence_level": {
                        "type": "number",
                        "description": "Confidence level (e.g. 0.95 for a 95% confidence interval)",
                        "minimum": 0,
                        "maximum": 1
                      },
                      "method": {
                        "type": "string",
                        "description": "How the confidence interval was computed"
                      }
                    },
                    "required": ["lower", "upper"]
                  },
                  "standard_deviation": {
                    "type": "number",
                    "description": "Standard deviation of the per-sample scores"
                  },
                  "num_samples": {
                    "type": "integer",
                    "description": "Number of samples used to compute the uncertainty estimates"
                  },
                  "num_bootstrap_samples": {
                    "type": "integer",
                    "description": "Number of bootstrap resamples used, if bootstrap method was applied"
                  }
                }
              }
            }
          },
          "generation_config": {
            "type": "object",
            "properties": {
              "generation_args": {
                "type": "object",
                "description": "Parameters used to generate results - properties may vary by model type",
                "properties": {
                  "temperature": {
                    "type": [
                      "null",
                      "number"
                    ],
                    "description": "Sampling temperature"
                  },
                  "top_p": {
                    "type": [
                      "null",
                      "number"
                    ],
                    "description": "Nucleus sampling parameter"
                  },
                  "top_k": {
                    "type": [
                      "null",
                      "number"
                    ],
                    "description": "Top-k sampling parameter"
                  },
                  "max_tokens": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "Maximum number of tokens to generate"
                  },
                  "execution_command": {
                    "type": "string",
                    "description": "Command used to run the model to generate results"
                  },
                  "reasoning": {
                    "type": "boolean",
                    "description": "Whether reasoning or chain-of-thought was used to generate results"
                  },
                  "prompt_template": {
                    "type": "string",
                    "description": "Input prompt template for task (should contain agentic info if needed)."
                  },
                  "agentic_eval_config": {
                    "type": "object",
                    "description": "General configuration for agentic evaluations.",
                    "properties": {
                      "available_tools": {
                        "type": "array",
                        "description": "List of all available tools with their configurations",
                        "items": {
                          "type": "object",
                          "properties": {
                            "name": {
                              "type": "string",
                              "description": "e.g. bash, calculator, ..."
                            },
                            "description": {
                              "type": "string"
                            },
                            "parameters": {
                              "$ref": "#/$defs/additional_properties_object"
                            }
                          }
                        }
                      },
                      "additional_details": {
                        "$ref": "#/$defs/additional_properties_object"
                      }
                    }
                  },
                  "eval_plan": {
                    "type": "object",
                    "description": "Plan (solvers) used in evaluation. Solvers are crucial parts of Inspect evaluations which can serve a wide variety of purposes like providing system prompts, prompt engineering, model generation or multi-turn dialog.",
                    "properties": {
                      "name": {
                        "type": "string"
                      },
                      "steps": {
                        "type": "array",
                        "description": "Array of evaluation plan steps",
                        "items": {
                          "type": "object",
                          "properties": {
                            "solver": {
                              "type": "string",
                              "description": "Name of solver e.g. system_message, react."
                            },
                            "parameters": {
                              "$ref": "#/$defs/additional_properties_object"
                            }
                          }
                        }
                      },
                      "config": {
                        "$ref": "#/$defs/additional_properties_object"
                      }
                    }
                  },
                  "eval_limits": {
                    "type": "object",
                    "description": "Listed evaluation limits like time limit, message limit, token limit.",
                    "properties": {
                      "time_limit": {
                        "type": "integer",
                        "description": "Time limit for evaluation."
                      },
                      "message_limit": {
                        "type": "integer",
                        "description": "Message limit for evaluation."
                      },
                      "token_limit": {
                        "type": "integer",
                        "description": "Token limit for evaluation."
                      }
                    }
                  },
                  "sandbox": {
                    "type": "object",
                    "properties": {
                      "type": {
                        "type": "string",
                        "description": "Type of sandbox e.g. docker"
                      },
                      "config": {
                        "type": "string",
                        "description": "Config file name/path e.g. compose.yaml. TODO or full config? Not sure based on the Inspect docs"
                      }
                    }
                  },
                  "max_attempts": {
                    "type": "integer",
                    "description": "Maximum number of submission attempts (default 1).",
                    "default": 1
                  },
                  "incorrect_attempt_feedback": {
                    "type": "string",
                    "description": "Feedback from the model after incorrect attempt."
                  }
                },
                "additionalProperties": true
              },
              "additional_details": {
                "$ref": "#/$defs/additional_properties_object"
              }
            }
          }
        }
      }
    },
    "detailed_evaluation_results": {
      "type": "object",
      "description": "Reference to the evaluation results for all individual samples in the evaluation",
      "properties": {
        "format": {
          "type": "string",
          "description": "Format of the detailed evaluation results",
          "enum": [
            "jsonl",
            "json"
          ]
        },
        "file_path": {
          "type": "string",
          "description": "Path to the detailed evaluation results file"
        },
        "hash_algorithm": {
          "type": "string",
          "description": "Hash algorithm used for checksum and sample_hash in instance-level data",
          "enum": [
            "sha256",
            "md5"
          ]
        },
        "checksum": {
          "type": "string",
          "description": "Checksum value of the file"
        },
        "total_rows": {
          "type": "integer",
          "description": "Total number of rows in the detailed evaluation results file"
        }
      }
    }
  },
  "$defs": {
    "additional_properties_object": {
      "type": "object",
      "description": "Additional parameters (key-value object)",
      "additionalProperties": true
    },
    "judge_config": {
      "type": "object",
      "description": "Configuration for a single LLM judge/juror",
      "required": [
        "model_info"
      ],
      "properties": {
        "model_info": {
          "$ref": "#/$defs/model_info"
        },
        "temperature": {
          "type": "number"
        },
        "weight": {
          "type": "number",
          "description": "Weight of this judge's score in aggregation (used in jury)"
        }
      }
    },
    "model_info": {
      "type": "object",
      "description": "Complete model specification including basic information, technical configuration and inference settings",
      "required": [
        "name",
        "id"
      ],
      "properties": {
        "name": {
          "type": "string",
          "description": "Model name provided by evaluation source"
        },
        "id": {
          "type": "string",
          "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)"
        },
        "developer": {
          "type": "string",
          "description": "Name of organization that provides the model (e.g. 'OpenAI')"
        },
        "inference_platform": {
          "type": "string",
          "description": "Name of inference platform which provides an access to models by API to run the evaluations or provides models weights to run them locally (e.g. HuggingFace, Bedrock, Together AI)"
        },
        "inference_engine": {
          "type": "object",
          "description": "Name of inference engine which provides an access to optimized models to use them for local evaluations (e.g. vLLM, Ollama).",
          "properties": {
            "name": {
              "type": "string",
              "description": "Name of the inference engine"
            },
            "version": {
              "type": "string",
              "description": "Version of the inference engine"
            }
          }
        },
        "additional_details": {
          "$ref": "#/$defs/additional_properties_object"
        }
      }
    },
    "source_data_url": {
      "type": "object",
      "description": "URL source for the evaluation data",
      "required": [
        "dataset_name",
        "source_type",
        "url"
      ],
      "additionalProperties": true,
      "properties": {
        "dataset_name": {
          "type": "string",
          "description": "Name of the source dataset"
        },
        "source_type": {
          "const": "url"
        },
        "url": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "minItems": 1,
          "description": "URL(s) for the source of the evaluation data"
        },
        "additional_details": {
          "$ref": "#/$defs/additional_properties_object"
        }
      }
    },
    "source_data_hf": {
      "type": "object",
      "description": "Details about HuggingFace dataset used for evaluation",
      "required": [
        "dataset_name",
        "source_type"
      ],
      "additionalProperties": true,
      "properties": {
        "dataset_name": {
          "type": "string",
          "description": "Name of the source dataset"
        },
        "source_type": {
          "const": "hf_dataset"
        },
        "hf_repo": {
          "type": "string",
          "description": "HuggingFace repository identifier"
        },
        "hf_split": {
          "type": "string",
          "description": "One of train, val or test."
        },
        "samples_number": {
          "type": "integer",
          "description": "Number of samples in the dataset"
        },
        "sample_ids": {
          "type": "array",
          "description": "Array of sample ids used for evaluation",
          "items": {
            "type": [
              "integer",
              "string"
            ]
          }
        },
        "additional_details": {
          "$ref": "#/$defs/additional_properties_object"
        }
      }
    },
    "source_data_private": {
      "type": "object",
      "description": "Generic source data when neither URL array nor HuggingFace dataset applies",
      "required": [
        "dataset_name",
        "source_type"
      ],
      "properties": {
        "dataset_name": {
          "type": "string",
          "description": "Name of the source dataset"
        },
        "source_type": {
          "const": "other"
        },
        "additional_details": {
          "$ref": "#/$defs/additional_properties_object"
        }
      }
    }
  }
}