from enum import Enum
from typing import Dict, List, Optional

from pydantic import BaseModel, ConfigDict, Field


class MetricType(str, Enum):
    """Supported evaluation metrics."""
    ACCURACY = "accuracy"
    FAITHFULNESS = "faithfulness"
    RELEVANCE = "relevance"
    TOXICITY = "toxicity"
    CONTEXT_PRECISION = "context_precision"
    CONTEXT_RECALL = "context_recall"


class APIProvider(str, Enum):
    """Supported inference providers for the judge model."""
    GROQ = "groq"
    OPENAI = "openai"


class EvaluationRequest(BaseModel):
    """Input payload for a batch evaluation run."""
    # Allow field names starting with "model_" without Pydantic v2 warnings.
    model_config = ConfigDict(protected_namespaces=())

    questions: List[str] = Field(..., description="Questions to evaluate")
    ground_truths: List[str] = Field(..., description="Ground truth answers")
    model_responses: Optional[List[str]] = Field(None, description="Model responses")
    contexts: Optional[List[str]] = Field(None, description="Contexts for evaluation")
    metrics: List[MetricType] = Field(
        default=[MetricType.ACCURACY, MetricType.FAITHFULNESS, MetricType.RELEVANCE]
    )
    judge_model: str = Field(default="openai/gpt-oss-20b")
    max_concurrent: int = Field(default=5, description="Max concurrent evaluations")
    api_provider: APIProvider = Field(default=APIProvider.GROQ, description="API provider for evaluation")


class EvaluationResult(BaseModel):
    """Per-question evaluation outcome."""
    model_config = ConfigDict(protected_namespaces=())

    question: str
    ground_truth: str
    model_response: str
    metrics: Dict[MetricType, float]
    explanations: Dict[MetricType, str]
    processing_time: float
    overall_score: float = Field(..., description="Overall weighted score (0-100)")


class EvaluationSummary(BaseModel):
    """Aggregate results for an entire evaluation run."""
    model_config = ConfigDict(protected_namespaces=())

    total_questions: int
    average_scores: Dict[MetricType, float]
    individual_results: List[EvaluationResult]
    total_processing_time: float
    model_used: str
    api_provider: str
    overall_score: float = Field(..., description="Overall weighted score across all questions")
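
For reference, a minimal usage sketch of these models, assuming they are imported from the module above; the sample question, scores, and explanations are illustrative values only, not output from a real evaluation run:

# Minimal usage sketch (illustrative values only).
request = EvaluationRequest(
    questions=["What is the capital of France?"],
    ground_truths=["Paris"],
    model_responses=["Paris is the capital of France."],
    metrics=[MetricType.ACCURACY, MetricType.FAITHFULNESS],
    api_provider=APIProvider.GROQ,
)

result = EvaluationResult(
    question=request.questions[0],
    ground_truth=request.ground_truths[0],
    model_response=request.model_responses[0],
    metrics={MetricType.ACCURACY: 0.95, MetricType.FAITHFULNESS: 0.90},
    explanations={
        MetricType.ACCURACY: "Matches the ground truth.",
        MetricType.FAITHFULNESS: "No unsupported claims in the response.",
    },
    processing_time=1.2,
    overall_score=92.5,
)

summary = EvaluationSummary(
    total_questions=1,
    average_scores=result.metrics,
    individual_results=[result],
    total_processing_time=result.processing_time,
    model_used=request.judge_model,
    api_provider=request.api_provider.value,
    overall_score=result.overall_score,
)
print(summary.model_dump_json(indent=2))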