from pydantic import BaseModel, Field, ConfigDict
from typing import List, Dict, Any, Optional
from enum import Enum


class MetricType(str, Enum):
    """Evaluation metrics supported by the judge."""
    ACCURACY = "accuracy"
    FAITHFULNESS = "faithfulness"
    RELEVANCE = "relevance"
    TOXICITY = "toxicity"
    CONTEXT_PRECISION = "context_precision"
    CONTEXT_RECALL = "context_recall"


class APIProvider(str, Enum):
    """LLM API providers that can back the judge model."""
    GROQ = "groq"
    OPENAI = "openai"


class EvaluationRequest(BaseModel):
    """Request payload for a batch evaluation run."""
    model_config = ConfigDict(protected_namespaces=())

    questions: List[str] = Field(..., description="Questions to evaluate")
    ground_truths: List[str] = Field(..., description="Ground truth answers")
    model_responses: Optional[List[str]] = Field(None, description="Model responses")
    contexts: Optional[List[str]] = Field(None, description="Contexts for evaluation")
    # Use enum members (not raw strings) and a factory so the default list is not shared.
    metrics: List[MetricType] = Field(
        default_factory=lambda: [MetricType.ACCURACY, MetricType.FAITHFULNESS, MetricType.RELEVANCE]
    )
    judge_model: str = Field(default="openai/gpt-oss-20b")
    max_concurrent: int = Field(default=5, description="Max concurrent evaluations")
    api_provider: APIProvider = Field(default=APIProvider.GROQ, description="API provider for evaluation")


class EvaluationResult(BaseModel):
    """Evaluation outcome for a single question."""
    model_config = ConfigDict(protected_namespaces=())

    question: str
    ground_truth: str
    model_response: str
    metrics: Dict[MetricType, float]
    explanations: Dict[MetricType, str]
    processing_time: float
    overall_score: float = Field(..., description="Overall weighted score (0-100)")


class EvaluationSummary(BaseModel):
    """Aggregate results across all evaluated questions."""
    model_config = ConfigDict(protected_namespaces=())

    total_questions: int
    average_scores: Dict[MetricType, float]
    individual_results: List[EvaluationResult]
    total_processing_time: float
    model_used: str
    api_provider: str
    overall_score: float = Field(..., description="Overall weighted score across all questions")
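

# --- Illustrative usage (a sketch, not part of the original module) ---
# A minimal, hypothetical example of how these models might be constructed;
# the sample data below exists only to demonstrate validation and enum coercion.
if __name__ == "__main__":
    request = EvaluationRequest(
        questions=["What is the capital of France?"],
        ground_truths=["Paris"],
        # Plain strings are coerced to MetricType members because MetricType subclasses str, Enum.
        metrics=["accuracy", "faithfulness"],
        api_provider="groq",
    )
    print(request.model_dump())

    result = EvaluationResult(
        question=request.questions[0],
        ground_truth=request.ground_truths[0],
        model_response="Paris",
        metrics={MetricType.ACCURACY: 1.0, MetricType.FAITHFULNESS: 1.0},
        explanations={
            MetricType.ACCURACY: "Exact match with the ground truth.",
            MetricType.FAITHFULNESS: "No unsupported claims.",
        },
        processing_time=0.42,
        overall_score=100.0,
    )
    print(result.overall_score)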