from opik.evaluation import models
from opik.evaluation.metrics import GEval

from src.config import settings
from src.utils.logger_util import setup_logging

logger = setup_logging()


# -----------------------
# Evaluation helper
# -----------------------
async def evaluate_metrics(output: str, context: str) -> dict:
    """Evaluate multiple metrics for a given LLM output.

    Metrics included: faithfulness, coherence, completeness.

    Args:
        output (str): The LLM-generated output to evaluate.
        context (str): The context used to generate the output.

    Returns:
        dict: A dictionary with metric names as keys and their evaluation results as values.
    """
    if not output.strip():
        logger.warning("Output is empty. Skipping evaluation.")
        return {
            "faithfulness": {"score": 0.0, "reason": "Empty output", "failed": True},
            "coherence": {"score": 0.0, "reason": "Empty output", "failed": True},
            "completeness": {"score": 0.0, "reason": "Empty output", "failed": True},
        }

    if not getattr(settings.openai, "api_key", None):
        logger.info("OpenAI API key not set. Skipping metrics evaluation.")
        return {
            "faithfulness": {"score": None, "reason": "Skipped - no API key", "failed": True},
            "coherence": {"score": None, "reason": "Skipped - no API key", "failed": True},
            "completeness": {"score": None, "reason": "Skipped - no API key", "failed": True},
        }

    judge_model = models.LiteLLMChatModel(
        model_name="gpt-4o",  # gpt-4o, gpt-5-mini
        api_key=settings.openai.api_key,
    )
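    # Each entry maps a metric name to a (task_introduction, evaluation_criteria)
    # pair that parameterizes a G-Eval judge prompt below.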
    metric_configs = {
        "faithfulness": (
            (
                "You are an expert judge tasked with evaluating whether an AI-generated answer is "
                "faithful to the provided Substack excerpts."
            ),
            (
                "The OUTPUT must not introduce new information beyond what is contained in the CONTEXT. "
                "All claims in the OUTPUT should be directly supported by the CONTEXT."
            ),
        ),
| "coherence": ( | |
| ( | |
| "You are an expert judge tasked with evaluating whether an AI-generated answer is " | |
| "logically coherent." | |
| ), | |
| "The answer should be well-structured, readable, and maintain consistent reasoning.", | |
| ), | |
| "completeness": ( | |
| ( | |
| "You are an expert judge tasked with evaluating whether an AI-generated answer " | |
| "covers all relevant aspects of the query." | |
| ), | |
| ( | |
| "The answer should include all major points from the CONTEXT " | |
| "and address the user's " | |
| "query " | |
| "fully." | |
| ), | |
| ), | |
| } | |
| results = {} | |
| for name, (task_intro, eval_criteria) in metric_configs.items(): | |
| try: | |
| metric = GEval( | |
| task_introduction=task_intro, | |
| evaluation_criteria=eval_criteria, | |
| model=judge_model, | |
| name=f"G-Eval {name.capitalize()}", | |
| ) | |
| eval_input = f""" | |
| OUTPUT: {output} | |
| CONTEXT: {context} | |
| """ | |
| score_result = await metric.ascore(eval_input) | |
| results[name] = { | |
| "score": score_result.value, | |
| "reason": score_result.reason, | |
| "failed": score_result.scoring_failed, | |
| } | |
| except Exception as e: | |
| logger.warning(f"G-Eval {name} failed: {e}") | |
| results[name] = {"score": 0.0, "reason": str(e), "failed": True} | |
| return results | |
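

# Usage sketch (not part of the original module): one way to exercise the async
# helper from a script. The sample output/context strings are illustrative only,
# and the call requires settings.openai.api_key to be configured.
if __name__ == "__main__":
    import asyncio

    sample_output = "The post argues that retrieval quality drives answer quality."
    sample_context = "Excerpt: retrieval quality is the main driver of answer quality."

    scores = asyncio.run(evaluate_metrics(sample_output, sample_context))
    for metric_name, result in scores.items():
        print(f"{metric_name}: score={result['score']} failed={result['failed']}")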