from opik.evaluation import models
from opik.evaluation.metrics import GEval
from src.config import settings
from src.utils.logger_util import setup_logging
logger = setup_logging()
# -----------------------
# Evaluation helper
# -----------------------
async def evaluate_metrics(output: str, context: str) -> dict:
"""Evaluate multiple metrics for a given LLM output.
Metrics included: faithfulness, coherence, completeness.
Args:
output (str): The LLM-generated output to evaluate.
context (str): The context used to generate the output.
Returns:
dict: A dictionary with metric names as keys and their evaluation results as values.
"""
if not output.strip():
logger.warning("Output is empty. Skipping evaluation.")
return {
"faithfulness": {"score": 0.0, "reason": "Empty output", "failed": True},
"coherence": {"score": 0.0, "reason": "Empty output", "failed": True},
"completeness": {"score": 0.0, "reason": "Empty output", "failed": True},
}
if not getattr(settings.openai, "api_key", None):
logger.info("OpenAI API key not set. Skipping metrics evaluation.")
return {
"faithfulness": {"score": None, "reason": "Skipped – no API key", "failed": True},
"coherence": {"score": None, "reason": "Skipped – no API key", "failed": True},
"completeness": {"score": None, "reason": "Skipped – no API key", "failed": True},
}
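    # LiteLLM-backed judge model shared by all G-Eval metrics below.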
judge_model = models.LiteLLMChatModel(
model_name="gpt-4o", # gpt-4o, gpt-5-mini
api_key=settings.openai.api_key,
)
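    # Each metric maps to a (task_introduction, evaluation_criteria) pair passed to GEval.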
metric_configs = {
"faithfulness": (
(
"You are an expert judge tasked with evaluating whether an AI-generated answer is "
"faithful to the provided Substack excerpts."
),
(
"The OUTPUT must not introduce new information and beyond "
"what is contained in the CONTEXT. "
"All claims in the OUTPUT should be directly supported by the CONTEXT."
),
),
"coherence": (
(
"You are an expert judge tasked with evaluating whether an AI-generated answer is "
"logically coherent."
),
"The answer should be well-structured, readable, and maintain consistent reasoning.",
),
"completeness": (
(
"You are an expert judge tasked with evaluating whether an AI-generated answer "
"covers all relevant aspects of the query."
),
            (
                "The answer should include all major points from the CONTEXT "
                "and address the user's query fully."
            ),
),
}
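    # Score each metric independently so a failure in one does not abort the others.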
results = {}
for name, (task_intro, eval_criteria) in metric_configs.items():
try:
metric = GEval(
task_introduction=task_intro,
evaluation_criteria=eval_criteria,
model=judge_model,
name=f"G-Eval {name.capitalize()}",
)
eval_input = f"""
OUTPUT: {output}
CONTEXT: {context}
"""
score_result = await metric.ascore(eval_input)
results[name] = {
"score": score_result.value,
"reason": score_result.reason,
"failed": score_result.scoring_failed,
}
except Exception as e:
logger.warning(f"G-Eval {name} failed: {e}")
results[name] = {"score": 0.0, "reason": str(e), "failed": True}
return results
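

# Minimal usage sketch for running this module directly. The output/context strings
# below are hypothetical placeholders, and a valid OpenAI API key in `settings.openai`
# is assumed for the metrics to actually be scored.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        results = await evaluate_metrics(
            output="The newsletter argues that consistent weekly publishing grows an audience.",
            context="Substack excerpt: the author attributes audience growth to publishing every week.",
        )
        for metric_name, result in results.items():
            logger.info(
                f"{metric_name}: score={result['score']}, "
                f"failed={result['failed']}, reason={result['reason']}"
            )

    asyncio.run(_demo())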