{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "0.2.0",
  "type": "object",
  "description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, outputs, and evaluation metrics",
  "required": [
    "schema_version",
    "evaluation_id",
    "retrieved_timestamp",
    "source_metadata",
    "model_info",
    "evaluation_results"
  ],
  "additionalProperties": false,
  "properties": {
    "schema_version": {
      "type": "string",
      "description": "Version of the schema used for this evaluation data"
    },
    "evaluation_id": {
      "type": "string",
      "description": "Unique identifier for this specific evaluation run. Use eval_name/model_id/retrieved_timestamp format"
    },
    "evaluation_timestamp": {
      "type": "string",
      "description": "Timestamp for when the evaluation was run"
    },
    "retrieved_timestamp": {
      "type": "string",
      "description": "Timestamp for when this record was created - using Unix Epoch time format"
    },
    "source_metadata": {
      "type": "object",
      "description": "Metadata about the source of the leaderboard data",
      "required": [
        "source_type",
        "source_organization_name",
        "evaluator_relationship"
      ],
      "properties": {
        "source_name": {
          "type": "string",
          "description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation)."
        },
        "source_type": {
          "type": "string",
          "enum": [
            "documentation",
            "evaluation_run"
          ],
          "description": "Whether the data comes from a direct evaluation run or from documentation"
        },
        "source_organization_name": {
          "type": "string",
          "description": "Name of the organization that provides the data"
        },
        "source_organization_url": {
          "type": "string",
          "description": "URL for the organization that provides the data"
        },
        "source_organization_logo_url": {
          "type": "string",
          "description": "URL for the logo for the organization that provides the data"
        },
        "evaluator_relationship": {
          "type": "string",
          "description": "Relationship between the evaluator and the model",
          "enum": [
            "first_party",
            "third_party",
            "collaborative",
            "other"
          ]
        }
      }
    },
    "model_info": {
      "$ref": "#/$defs/model_info"
    },
    "evaluation_results": {
      "type": "array",
      "description": "Array of evaluation results",
      "items": {
        "type": "object",
        "required": [
          "evaluation_name",
          "source_data",
          "metric_config",
          "score_details"
        ],
        "properties": {
          "evaluation_name": {
            "type": "string",
            "description": "Name of the evaluation"
          },
          "source_data": {
            "description": "Source of dataset for this evaluation: URL, HuggingFace dataset, or private/custom dataset.",
            "oneOf": [
              {
                "$ref": "#/$defs/source_data_url"
              },
              {
                "$ref": "#/$defs/source_data_hf"
              },
              {
                "$ref": "#/$defs/source_data_private"
              }
            ]
          },
          "evaluation_timestamp": {
            "type": "string",
            "description": "Timestamp for when the evaluations were run"
          },
          "metric_config": {
            "type": "object",
            "description": "Details about the metric",
            "required": [
              "lower_is_better"
            ],
            "properties": {
              "evaluation_description": {
                "type": "string",
                "description": "Description of the evaluation"
              },
              "lower_is_better": {
                "type": "boolean",
                "description": "Whether a lower score is better"
              },
              "score_type": {
                "type": "string",
                "description": "Type of score",
                "enum": [
                  "binary",
                  "continuous",
                  "levels"
                ]
              },
              "level_names": {
                "type": "array",
                "description": "Names of the score levels",
                "items": {
                  "type": "string"
                }
              },
              "level_metadata": {
                "type": "array",
                "description": "Additional Description for each Score Level",
                "items": {
                  "type": "string"
                }
              },
              "has_unknown_level": {
                "type": "boolean",
                "description": "Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown"
              },
              "min_score": {
                "type": "number",
                "description": "Minimum possible score for continuous metric"
              },
              "max_score": {
                "type": "number",
                "description": "Maximum possible score for continuous metric"
              },
              "llm_scoring": {
                "type": "object",
                "description": "Configuration when LLM is used as scorer/judge",
                "additionalProperties": true,
                "required": [
                  "judges",
                  "input_prompt"
                ],
                "properties": {
                  "judges": {
                    "type": "array",
                    "description": "LLM judge(s) - single item for judge, multiple for jury",
                    "items": {
                      "$ref": "#/$defs/judge_config"
                    },
                    "minItems": 1
                  },
                  "input_prompt": {
                    "type": "string",
                    "description": "Prompt template used for judging"
                  },
                  "aggregation_method": {
                    "type": "string",
                    "enum": [
                      "majority_vote",
                      "average",
                      "weighted_average",
                      "median"
                    ],
                    "description": "How to aggregate scores when multiple judges"
                  },
                  "expert_baseline": {
                    "type": "number",
                    "description": "Expert/human baseline score for comparison"
                  },
                  "additional_details": {
                    "$ref": "#/$defs/additional_properties_object"
                  }
                }
              }
            },
            "if": {
              "required": [
                "score_type"
              ],
              "properties": {
                "score_type": {
                  "const": "levels"
                }
              }
            },
            "then": {
              "required": [
                "level_names",
                "has_unknown_level"
              ]
            },
            "else": {
              "if": {
                "required": [
                  "score_type"
                ],
                "properties": {
                  "score_type": {
                    "const": "continuous"
                  }
                }
              },
              "then": {
                "required": [
                  "min_score",
                  "max_score"
                ]
              }
            }
          },
          "score_details": {
            "type": "object",
            "description": "The score for the evaluation and related details",
            "required": [
              "score"
            ],
            "properties": {
              "score": {
                "type": "number",
                "description": "The score for the evaluation"
              },
              "details": {
                "$ref": "#/$defs/additional_properties_object"
              },
              "uncertainty": {
                "type": "object",
                "description": "Quantification of uncertainty around the reported score",
                "properties": {
                  "standard_error": {
                    "type": "object",
                    "description": "Standard error of the score estimate (SE_mean = standard_deviation / sqrt(num_samples))",
                    "properties": {
                      "value": {
                        "type": "number",
                        "description": "The standard error value"
                      },
                      "method": {
                        "type": "string",
                        "description": "How the standard error was computed (e.g. 'analytic', 'bootstrap', 'jackknife')"
                      }
                    },
                    "required": ["value"]
                  },
                  "confidence_interval": {
                    "type": "object",
                    "description": "Lower and upper bounds for the metric at a given confidence level.",
                    "properties": {
                      "lower": {
                        "type": "number",
                        "description": "Lower bound of the confidence interval"
                      },
                      "upper": {
                        "type": "number",
                        "description": "Upper bound of the confidence interval"
                      },
                      "confidence_level": {
                        "type": "number",
                        "description": "Confidence level (e.g. 0.95 for a 95% confidence interval)",
                        "minimum": 0,
                        "maximum": 1
                      },
                      "method": {
                        "type": "string",
                        "description": "How the confidence interval was computed"
                      }
                    },
                    "required": ["lower", "upper"]
                  },
                  "standard_deviation": {
                    "type": "number",
                    "description": "Standard deviation of the per-sample scores"
                  },
                  "num_samples": {
                    "type": "integer",
                    "description": "Number of samples used to compute the uncertainty estimates"
                  },
                  "num_bootstrap_samples": {
                    "type": "integer",
                    "description": "Number of bootstrap resamples used, if bootstrap method was applied"
                  }
                }
              }
            }
          },
          "generation_config": {
            "type": "object",
            "properties": {
              "generation_args": {
                "type": "object",
                "description": "Parameters used to generate results - properties may vary by model type",
                "properties": {
                  "temperature": {
                    "type": [
                      "null",
                      "number"
                    ],
                    "description": "Sampling temperature"
                  },
                  "top_p": {
                    "type": [
                      "null",
                      "number"
                    ],
                    "description": "Nucleus sampling parameter"
                  },
                  "top_k": {
                    "type": [
                      "null",
                      "number"
                    ],
                    "description": "Top-k sampling parameter"
                  },
                  "max_tokens": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "Maximum number of tokens to generate"
                  },
                  "execution_command": {
                    "type": "string",
                    "description": "Command used to run the model to generate results"
                  },
                  "reasoning": {
                    "type": "boolean",
                    "description": "Whether reasoning or chain-of-thought was used to generate results"
                  },
                  "prompt_template": {
                    "type": "string",
                    "description": "Input prompt template for task (should contain agentic info if needed)."
                  },
                  "agentic_eval_config": {
                    "type": "object",
                    "description": "General configuration for agentic evaluations.",
                    "properties": {
                      "available_tools": {
                        "type": "array",
                        "description": "List of all available tools with their configurations",
                        "items": {
                          "type": "object",
                          "properties": {
                            "name": {
                              "type": "string",
                              "description": "e.g. bash, calculator, ..."
                            },
                            "description": {
                              "type": "string"
                            },
                            "parameters": {
                              "$ref": "#/$defs/additional_properties_object"
                            }
                          }
                        }
                      },
                      "additional_details": {
                        "$ref": "#/$defs/additional_properties_object"
                      }
                    }
                  },
                  "eval_plan": {
                    "type": "object",
                    "description": "Plan (solvers) used in evaluation. Solvers are crucial parts of Inspect evaluations which can serve a wide variety of purposes like providing system prompts, prompt engineering, model generation or multi-turn dialog.",
                    "properties": {
                      "name": {
                        "type": "string"
                      },
                      "steps": {
                        "type": "array",
                        "description": "Array of evaluation plan steps",
                        "items": {
                          "type": "object",
                          "properties": {
                            "solver": {
                              "type": "string",
                              "description": "Name of solver e.g. system_message, react."
                            },
                            "parameters": {
                              "$ref": "#/$defs/additional_properties_object"
                            }
                          }
                        }
                      },
                      "config": {
                        "$ref": "#/$defs/additional_properties_object"
                      }
                    }
                  },
                  "eval_limits": {
                    "type": "object",
                    "description": "Listed evaluation limits like time limit, message limit, token limit.",
                    "properties": {
                      "time_limit": {
                        "type": "integer",
                        "description": "Time limit for evaluation."
                      },
                      "message_limit": {
                        "type": "integer",
                        "description": "Message limit for evaluation."
                      },
                      "token_limit": {
                        "type": "integer",
                        "description": "Token limit for evaluation."
                      }
                    }
                  },
                  "sandbox": {
                    "type": "object",
                    "properties": {
                      "type": {
                        "type": "string",
                        "description": "Type of sandbox e.g. docker"
                      },
                      "config": {
                        "type": "string",
                        "description": "Config file name/path e.g. compose.yaml. TODO or full config? Not sure based on the Inspect docs"
                      }
                    }
                  },
                  "max_attempts": {
                    "type": "integer",
                    "description": "Maximum number of submission attempts (default 1).",
                    "default": 1
                  },
                  "incorrect_attempt_feedback": {
                    "type": "string",
                    "description": "Feedback from the model after incorrect attempt."
                  }
                },
                "additionalProperties": true
              },
              "additional_details": {
                "$ref": "#/$defs/additional_properties_object"
              }
            }
          }
        }
      }
    },
    "detailed_evaluation_results": {
      "type": "object",
      "description": "Reference to the evaluation results for all individual samples in the evaluation",
      "properties": {
        "format": {
          "type": "string",
          "description": "Format of the detailed evaluation results",
          "enum": [
            "jsonl",
            "json"
          ]
        },
        "file_path": {
          "type": "string",
          "description": "Path to the detailed evaluation results file"
        },
        "hash_algorithm": {
          "type": "string",
          "description": "Hash algorithm used for checksum and sample_hash in instance-level data",
          "enum": [
            "sha256",
            "md5"
          ]
        },
        "checksum": {
          "type": "string",
          "description": "Checksum value of the file"
        },
        "total_rows": {
          "type": "integer",
          "description": "Total number of rows in the detailed evaluation results file"
        }
      }
    }
  },
  "$defs": {
    "additional_properties_object": {
      "type": "object",
      "description": "Additional parameters (key-value object)",
      "additionalProperties": true
    },
    "judge_config": {
      "type": "object",
      "description": "Configuration for a single LLM judge/juror",
      "required": [
        "model_info"
      ],
      "properties": {
        "model_info": {
          "$ref": "#/$defs/model_info"
        },
        "temperature": {
          "type": "number"
        },
        "weight": {
          "type": "number",
          "description": "Weight of this judge's score in aggregation (used in jury)"
        }
      }
    },
    "model_info": {
      "type": "object",
      "description": "Complete model specification including basic information, technical configuration and inference settings",
      "required": [
        "name",
        "id"
      ],
      "properties": {
        "name": {
          "type": "string",
          "description": "Model name provided by evaluation source"
        },
        "id": {
          "type": "string",
          "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)"
        },
        "developer": {
          "type": "string",
          "description": "Name of organization that provides the model (e.g. 'OpenAI')"
        },
        "inference_platform": {
          "type": "string",
          "description": "Name of inference platform which provides an access to models by API to run the evaluations or provides models weights to run them locally (e.g. HuggingFace, Bedrock, Together AI)"
        },
        "inference_engine": {
          "type": "object",
          "description": "Name of inference engine which provides an access to optimized models to use them for local evaluations (e.g. vLLM, Ollama).",
          "properties": {
            "name": {
              "type": "string",
              "description": "Name of the inference engine"
            },
            "version": {
              "type": "string",
              "description": "Version of the inference engine"
            }
          }
        },
        "additional_details": {
          "$ref": "#/$defs/additional_properties_object"
        }
      }
    },
    "source_data_url": {
      "type": "object",
      "description": "URL source for the evaluation data",
      "required": [
        "dataset_name",
        "source_type",
        "url"
      ],
      "additionalProperties": true,
      "properties": {
        "dataset_name": {
          "type": "string",
          "description": "Name of the source dataset"
        },
        "source_type": {
          "const": "url"
        },
        "url": {
          "type": "array",
          "items": {
            "type": "string"
          },
          "minItems": 1,
          "description": "URL(s) for the source of the evaluation data"
        },
        "additional_details": {
          "$ref": "#/$defs/additional_properties_object"
        }
      }
    },
    "source_data_hf": {
      "type": "object",
      "description": "Details about HuggingFace dataset used for evaluation",
      "required": [
        "dataset_name",
        "source_type"
      ],
      "additionalProperties": true,
      "properties": {
        "dataset_name": {
          "type": "string",
          "description": "Name of the source dataset"
        },
        "source_type": {
          "const": "hf_dataset"
        },
        "hf_repo": {
          "type": "string",
          "description": "HuggingFace repository identifier"
        },
        "hf_split": {
          "type": "string",
          "description": "One of train, val or test."
        },
        "samples_number": {
          "type": "integer",
          "description": "Number of samples in the dataset"
        },
        "sample_ids": {
          "type": "array",
          "description": "Array of sample ids used for evaluation",
          "items": {
            "type": [
              "integer",
              "string"
            ]
          }
        },
        "additional_details": {
          "$ref": "#/$defs/additional_properties_object"
        }
      }
    },
    "source_data_private": {
      "type": "object",
      "description": "Generic source data when neither URL array nor HuggingFace dataset applies",
      "required": [
        "dataset_name",
        "source_type"
      ],
      "properties": {
        "dataset_name": {
          "type": "string",
          "description": "Name of the source dataset"
        },
        "source_type": {
          "const": "other"
        },
        "additional_details": {
          "$ref": "#/$defs/additional_properties_object"
        }
      }
    }
  }
}