from pydantic import BaseModel, Field, ConfigDict
from typing import List, Dict, Any, Optional
from enum import Enum


class MetricType(str, Enum):
    """Supported evaluation metrics."""
    ACCURACY = "accuracy"
    FAITHFULNESS = "faithfulness"
    RELEVANCE = "relevance"
    TOXICITY = "toxicity"
    CONTEXT_PRECISION = "context_precision"
    CONTEXT_RECALL = "context_recall"


class APIProvider(str, Enum):
    """Supported LLM API providers."""
    GROQ = "groq"
    OPENAI = "openai"


class EvaluationRequest(BaseModel):
    """Input payload describing an evaluation run."""
    model_config = ConfigDict(protected_namespaces=())

    questions: List[str] = Field(..., description="Questions to evaluate")
    ground_truths: List[str] = Field(..., description="Ground truth answers")
    model_responses: Optional[List[str]] = Field(None, description="Model responses")
    contexts: Optional[List[str]] = Field(None, description="Contexts for evaluation")
    metrics: List[MetricType] = Field(
        default_factory=lambda: [
            MetricType.ACCURACY,
            MetricType.FAITHFULNESS,
            MetricType.RELEVANCE,
        ]
    )
    judge_model: str = Field(default="openai/gpt-oss-20b")
    max_concurrent: int = Field(default=5, description="Max concurrent evaluations")
    api_provider: APIProvider = Field(default=APIProvider.GROQ, description="API provider for evaluation")


class EvaluationResult(BaseModel):
    """Per-question evaluation outcome."""
    model_config = ConfigDict(protected_namespaces=())

    question: str
    ground_truth: str
    model_response: str
    metrics: Dict[MetricType, float]
    explanations: Dict[MetricType, str]
    processing_time: float
    overall_score: float = Field(..., description="Overall weighted score (0-100)")


class EvaluationSummary(BaseModel):
    """Aggregate results for a full evaluation run."""
    model_config = ConfigDict(protected_namespaces=())

    total_questions: int
    average_scores: Dict[MetricType, float]
    individual_results: List[EvaluationResult]
    total_processing_time: float
    model_used: str
    api_provider: str
    overall_score: float = Field(..., description="Overall weighted score across all questions")