Spaces:

evaleval
/

eee_validator

Running

File size: 31,881 Bytes

92ea780

{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "version": "0.2.0",
    "type": "object",
    "description": "Schema for storing and validating LLM evaluation data, including model configuration, prompts, instances, outputs, and evaluation metrics",
    "required": [
        "schema_version",
        "evaluation_id",
        "retrieved_timestamp",
        "source_metadata",
        "model_info",
        "evaluation_results"
    ],
    "additionalProperties": false,
    "properties": {
        "schema_version": {
            "type": "string",
            "description": "Version of the schema used for this evaluation data"
        },
        "evaluation_id": {
            "type": "string",
            "description": "Unique identifier for this specific evaluation run. Use eval_name/model_id/retrieved_timestamp format"
        },
        "evaluation_timestamp": {
            "type": "string",
            "description": "Timestamp for when the evaluation was run"
        },
        "retrieved_timestamp": {
            "type": "string",
            "description": "Timestamp for when this record was created - using Unix Epoch time format"
        },
        "source_metadata": {
            "type": "object",
            "description": "Metadata about the source of the leaderboard data",
            "required": [
                "source_type",
                "source_organization_name",
                "evaluator_relationship"
            ],
            "properties": {
                "source_name": {
                    "type": "string",
                    "description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation)."
                },
                "source_type": {
                    "type": "string",
                    "enum": [
                        "documentation",
                        "evaluation_run"
                    ],
                    "description": "Whether the data comes from a direct evaluation run or from documentation"
                },
                "source_organization_name": {
                    "type": "string",
                    "description": "Name of the organization that provides the data"
                },
                "source_organization_url": {
                    "type": "string",
                    "description": "URL for the organization that provides the data"
                },
                "source_organization_logo_url": {
                    "type": "string",
                    "description": "URL for the logo of the organization that provides the data"
                },
                "evaluator_relationship": {
                    "type": "string",
                    "description": "Relationship between the evaluator and the model",
                    "enum": [
                        "first_party",
                        "third_party",
                        "collaborative",
                        "other"
                    ]
                }
            }
        },
        "model_info": {
            "$ref": "#/$defs/model_info"
        },
        "evaluation_results": {
            "type": "array",
            "description": "Array of evaluation results",
            "items": {
                "type": "object",
                "required": [
                    "evaluation_name",
                    "source_data",
                    "metric_config",
                    "score_details"
                ],
                "properties": {
                    "evaluation_name": {
                        "type": "string",
                        "description": "Name of the evaluation"
                    },
                    "source_data": {
                        "description": "Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.",
                        "oneOf": [
                            {
                                "$ref": "#/$defs/source_data_url"
                            },
                            {
                                "$ref": "#/$defs/source_data_hf"
                            },
                            {
                                "$ref": "#/$defs/source_data_private"
                            }
                        ]
                    },
                    "evaluation_timestamp": {
                        "type": "string",
                        "description": "Timestamp for when the evaluations were run"
                    },
                    "metric_config": {
                        "type": "object",
                        "description": "Details about the metric",
                        "required": [
                            "lower_is_better"
                        ],
                        "properties": {
                            "evaluation_description": {
                                "type": "string",
                                "description": "Description of the evaluation"
                            },
                            "lower_is_better": {
                                "type": "boolean",
                                "description": "Whether a lower score is better"
                            },
                            "score_type": {
                                "type": "string",
                                "description": "Type of score",
                                "enum": [
                                    "binary",
                                    "continuous",
                                    "levels"
                                ]
                            },
                            "level_names": {
                                "type": "array",
                                "description": "Names of the score levels",
                                "items": {
                                    "type": "string"
                                }
                            },
                            "level_metadata": {
                                "type": "array",
                                "description": "Additional description for each score level",
                                "items": {
                                    "type": "string"
                                }
                            },
                            "has_unknown_level": {
                                "type": "boolean",
                                "description": "Indicates whether there is an unknown level - if true, a score of -1 will be treated as unknown"
                            },
                            "min_score": {
                                "type": "number",
                                "description": "Minimum possible score for continuous metric"
                            },
                            "max_score": {
                                "type": "number",
                                "description": "Maximum possible score for continuous metric"
                            },
                            "llm_scoring": {
                                "type": "object",
                                "description": "Configuration when LLM is used as scorer/judge",
                                "additionalProperties": true,
                                "required": [
                                    "judges",
                                    "input_prompt"
                                ],
                                "properties": {
                                    "judges": {
                                        "type": "array",
                                        "description": "LLM judge(s) - single item for judge, multiple for jury",
                                        "items": {
                                            "$ref": "#/$defs/judge_config"
                                        },
                                        "minItems": 1
                                    },
                                    "input_prompt": {
                                        "type": "string",
                                        "description": "Prompt template used for judging"
                                    },
                                    "aggregation_method": {
                                        "type": "string",
                                        "enum": [
                                            "majority_vote",
                                            "average",
                                            "weighted_average",
                                            "median"
                                        ],
                                        "description": "How to aggregate scores when there are multiple judges"
                                    },
                                    "expert_baseline": {
                                        "type": "number",
                                        "description": "Expert/human baseline score for comparison"
                                    },
                                    "additional_details": {
                                        "$ref": "#/$defs/additional_properties_object"
                                    }
                                }
                            }
                        },
                        "if": {
                            "properties": {
                                "score_type": {
                                    "const": "levels"
                                }
                            }
                        },
                        "then": {
                            "required": [
                                "level_names",
                                "has_unknown_level"
                            ]
                        },
                        "else": {
                            "if": {
                                "properties": {
                                    "score_type": {
                                        "const": "continuous"
                                    }
                                }
                            },
                            "then": {
                                "required": [
                                    "min_score",
                                    "max_score"
                                ]
                            }
                        }
                    },
                    "score_details": {
                        "type" : "object",
                        "description": "The score for the evaluation and related details",
                        "required": [
                            "score"
                        ],
                        "properties": {
                            "score": {
                                "type": "number",
                                "description": "The score for the evaluation"
                            },
                            "details": {
                                "$ref": "#/$defs/additional_properties_object"
                            },
                            "uncertainty": {
                                "type": "object",
                                "description": "Quantification of uncertainty around the reported score",
                                "properties": {
                                    "standard_error": {
                                        "type": "object",
                                        "description": "Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))",
                                        "properties": {
                                            "value": {
                                                "type": "number",
                                                "description": "The standard error value"
                                            },
                                            "method": {
                                                "type": "string",
                                                "description": "How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')"
                                            }
                                        },
                                        "required": ["value"]
                                    },
                                    "confidence_interval": {
                                        "type": "object",
                                        "description": "Lower and upper bounds for the metric at a given confidence level.",
                                        "properties": {
                                            "lower": {
                                                "type": "number",
                                                "description": "Lower bound of the confidence interval"
                                            },
                                            "upper": {
                                                "type": "number",
                                                "description": "Upper bound of the confidence interval"
                                            },
                                            "confidence_level": {
                                                "type": "number",
                                                "description": "Confidence level (e.g. 0.95 for a 95% confidence interval)",
                                                "minimum": 0,
                                                "maximum": 1
                                            },
                                            "method": {
                                                "type": "string",
                                                "description": "How the confidence interval was computed"
                                            }
                                        },
                                        "required": ["lower", "upper"]
                                    },
                                    "standard_deviation": {
                                        "type": "number",
                                        "description": "Standard deviation of the per-sample scores"
                                    },
                                    "num_samples": {
                                        "type": "integer",
                                        "description": "Number of samples used to compute the uncertainty estimates"
                                    },
                                    "num_bootstrap_samples": {
                                        "type": "integer",
                                        "description": "Number of bootstrap resamples used, if bootstrap method was applied"
                                    }
                                }
                            }
                        }
                    },
                    "generation_config": {
                        "type": "object",
                        "properties": {
                            "generation_args": {
                                "type": "object",
                                "description": "Parameters used to generate results - properties may vary by model type",
                                "properties": {
                                    "temperature": {
                                        "type": [
                                            "null",
                                            "number"
                                        ],
                                        "description": "Sampling temperature"
                                    },
                                    "top_p": {
                                        "type": [
                                            "null",
                                            "number"
                                        ],
                                        "description": "Nucleus sampling parameter"
                                    },
                                    "top_k": {
                                        "type": [
                                            "null",
                                            "number"
                                        ],
                                        "description": "Top-k sampling parameter"
                                    },
                                    "max_tokens": {
                                        "type": "integer",
                                        "minimum": 1,
                                        "description": "Maximum number of tokens to generate"
                                    },
                                    "execution_command": {
                                        "type": "string",
                                        "description": "Command used to run the model to generate results"
                                    },
                                    "reasoning": {
                                        "type": "boolean",
                                        "description": "Whether reasoning or chain-of-thought was used to generate results"
                                    },
                                    "prompt_template": {
                                        "type": "string",
                                        "description": "Input prompt template for task (should contain agentic info if needed)."
                                    },
                                    "agentic_eval_config": {
                                        "type": "object",
                                        "description": "General configuration for agentic evaluations.",
                                        "properties": {
                                            "available_tools": {
                                                "type": "array",
                                                "description": "List of all available tools with their configurations",
                                                "items": {
                                                    "type": "object",
                                                    "properties": {
                                                        "name": {
                                                            "type": "string",
                                                            "description": "e.g. bash, calculator, ..."
                                                        },
                                                        "description": {
                                                            "type": "string"
                                                        },
                                                        "parameters": {
                                                            "$ref": "#/$defs/additional_properties_object"
                                                        }
                                                    }
                                                }
                                            },
                                            "additional_details": {
                                                "$ref": "#/$defs/additional_properties_object"
                                            }
                                        }
                                    },
                                    "eval_plan": {
                                        "type": "object",
                                        "description": "Plan (solvers) used in evaluation. Solvers are crucial parts of Inspect evaluations which can serve a wide variety of purposes like providing system prompts, prompt engineering, model generation or multi-turn dialog.",
                                        "properties": {
                                            "name": {
                                                "type": "string"
                                            },
                                            "steps": {
                                                "type": "array",
                                                "description": "Array of evaluation plan steps",
                                                "items": {
                                                    "type": "object",
                                                    "properties": {
                                                        "solver": {
                                                            "type": "string",
                                                            "description": "Name of solver e.g. system_message, react."
                                                        },
                                                        "parameters": {
                                                            "$ref": "#/$defs/additional_properties_object"
                                                        }
                                                    }
                                                }
                                            },
                                            "config": {
                                                "$ref": "#/$defs/additional_properties_object"
                                            }
                                        }
                                    },
                                    "eval_limits": {
                                        "type": "object",
                                        "description": "Listed evaluation limits like time limit, message limit, token limit.",
                                        "properties": {
                                            "time_limit": {
                                                "type": "integer",
                                                "description": "Time limit for evaluation."
                                            },
                                            "message_limit": {
                                                "type": "integer",
                                                "description": "Message limit for evaluation."
                                            },
                                            "token_limit": {
                                                "type": "integer",
                                                "description": "Token limit for evaluation."
                                            }
                                        }
                                    },
                                    "sandbox": {
                                        "type": "object",
                                        "properties": {
                                            "type": {
                                                "type": "string",
                                                "description": "Type of sandbox e.g. docker"
                                            },
                                            "config": {
                                                "type": "string",
                                                "description": "Config file name/path (e.g. compose.yaml). TODO: decide whether this should hold the full config instead - the Inspect docs are unclear on this"
                                            }
                                        }
                                    },
                                    "max_attempts": {
                                        "type": "integer",
                                        "description": "Maximum number of submission attempts (default 1).",
                                        "default": 1
                                    },
                                    "incorrect_attempt_feedback": {
                                        "type": "string",
                                        "description": "Feedback from the model after incorrect attempt."
                                    }
                                },
                                "additionalProperties": true
                            },
                            "additional_details": {
                                "$ref": "#/$defs/additional_properties_object"
                            }
                        }
                    }
                }
            }
        },
        "detailed_evaluation_results": {
            "description": "Reference to the evaluation results for all individual samples in the evaluation",
            "properties": {
                "format": {
                    "type": "string",
                    "description": "Format of the detailed evaluation results",
                    "enum": [
                        "jsonl",
                        "json"
                    ]
                },
                "file_path": {
                    "type": "string",
                    "description": "Path to the detailed evaluation results file"
                },
                "hash_algorithm": {
                    "type": "string",
                    "description": "Hash algorithm used for checksum and sample_hash in instance-level data",
                    "enum": [
                        "sha256",
                        "md5"
                    ]
                },
                "checksum": {
                    "type": "string",
                    "description": "Checksum value of the file"
                },
                "total_rows": {
                    "type": "integer",
                    "description": "Total number of rows in the detailed evaluation results file"
                }
            }
        }
    },
    "$defs": {
        "additional_properties_object": {
            "type": "object",
            "description": "Additional parameters (key-value object)",
            "additionalProperties": true
        },
        "judge_config": {
            "type": "object",
            "description": "Configuration for a single LLM judge/juror",
            "required": [
                "model_info"
            ],
            "properties": {
                "model_info": {
                    "$ref": "#/$defs/model_info"
                },
                "temperature": {
                    "type": "number"
                },
                "weight": {
                    "type": "number",
                    "description": "Weight of this judge's score in aggregation (used in jury)"
                }
            }
        },
        "model_info": {
            "type": "object",
            "description": "Complete model specification including basic information, technical configuration and inference settings",
            "required": [
                "name",
                "id"
            ],
            "properties": {
                "name": {
                    "type": "string",
                    "description": "Model name provided by evaluation source"
                },
                "id": {
                    "type": "string",
                    "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)"
                },
                "developer": {
                    "type": "string",
                    "description": "Name of organization that provides the model (e.g. 'OpenAI')"
                },
                "inference_platform": {
                    "type": "string",
                    "description": "Name of the inference platform that provides API access to models for running the evaluations or provides model weights to run them locally (e.g. HuggingFace, Bedrock, Together AI)"
                },
                "inference_engine": {
                    "type": "object",
                    "description": "Inference engine (name and version) that provides access to optimized models for local evaluations (e.g. vLLM, Ollama).",
                    "properties": {
                        "name": {
                            "type": "string",
                            "description": "Name of the inference engine"
                        },
                        "version": {
                            "type": "string",
                            "description": "Version of the inference engine"
                        }
                    }
                },
                "additional_details": {
                    "$ref": "#/$defs/additional_properties_object"
                }
            }
        },
        "source_data_url": {
            "type": "object",
            "description": "URL source for the evaluation data",
            "required": [
                "dataset_name",
                "source_type",
                "url"
            ],
            "additionalProperties": true,
            "properties": {
                "dataset_name": {
                    "type": "string",
                    "description": "Name of the source dataset"
                },
                "source_type": {
                    "const": "url"
                },
                "url": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    },
                    "minItems": 1,
                    "description": "URL(s) for the source of the evaluation data"
                },
                "additional_details": {
                    "$ref": "#/$defs/additional_properties_object"
                }
            }
        },
        "source_data_hf": {
            "type": "object",
            "description": "Details about HuggingFace dataset used for evaluation",
            "required": [
                "dataset_name",
                "source_type"
            ],
            "additionalProperties": true,
            "properties": {
                "dataset_name": {
                    "type": "string",
                    "description": "Name of the source dataset"
                },
                "source_type": {
                    "const": "hf_dataset"
                },
                "hf_repo": {
                    "type": "string",
                    "description": "HuggingFace repository identifier"
                },
                "hf_split": {
                    "type": "string",
                    "description": "One of train, val or test."
                },
                "samples_number": {
                    "type": "integer",
                    "description": "Number of samples in the dataset"
                },
                "sample_ids": {
                    "type": "array",
                    "description": "Array of sample ids used for evaluation",
                    "items": {
                        "type": [
                            "integer",
                            "string"
                        ]
                    }
                },
                "additional_details": {
                    "$ref": "#/$defs/additional_properties_object"
                }
            }
        },
        "source_data_private": {
            "type": "object",
            "description": "Generic source data when neither URL array nor HuggingFace dataset applies",
            "required": [
                "dataset_name",
                "source_type"
            ],
            "properties": {
                "dataset_name": {
                    "type": "string",
                    "description": "Name of the source dataset"
                },
                "source_type": {
                    "const": "other"
                },
                "additional_details": {
                    "$ref": "#/$defs/additional_properties_object"
                }
            }
        }
    }
}