from langchain.tools import tool
from typing import Dict, Any, Optional
import time


@tool
def evaluate_response(question: str, ground_truth: str, response: str, metric: str,
                      chain: Any, context: Optional[str] = None) -> Dict[str, Any]:
    """Evaluate a response against its ground truth for a specific metric using a LangChain chain."""
    start_time = time.time()

    try:
        # Base inputs shared by every metric.
        input_data = {
            "question": question,
            "ground_truth": ground_truth,
            "response": response
        }

        # Context-based metrics also need the retrieved context.
        if metric in ["context_precision", "context_recall"]:
            input_data["context"] = context if context else "No context provided."

        # Run the evaluation chain and attach timing information.
        result = chain.invoke(input_data)

        processing_time = time.time() - start_time
        result["processing_time"] = processing_time

        return result
    except Exception as e:
        # On failure, return a zero score with the error message so callers can still aggregate results.
        return {
            "score": 0,
            "explanation": f"Evaluation failed: {str(e)}",
            "processing_time": time.time() - start_time
        }
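

# Usage sketch (not part of the tool definition above): one way to build the `chain`
# argument so that chain.invoke(...) returns a dict with "score" and "explanation" keys,
# as evaluate_response expects. The prompt wording, the model name, and the sample
# question/answer values below are illustrative assumptions, not taken from the original.

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai import ChatOpenAI

# Prompt that asks the judge model to reply with a JSON object.
eval_prompt = ChatPromptTemplate.from_template(
    "You are an evaluator. Given the question, ground truth, and response, "
    'return a JSON object with keys "score" (0-10) and "explanation".\n\n'
    "Question: {question}\nGround truth: {ground_truth}\nResponse: {response}"
)

# LCEL pipeline: prompt -> chat model -> parsed JSON dict.
faithfulness_chain = eval_prompt | ChatOpenAI(model="gpt-4o-mini", temperature=0) | JsonOutputParser()

# A @tool-decorated function is invoked with a dict of its arguments.
result = evaluate_response.invoke({
    "question": "What is the capital of France?",
    "ground_truth": "Paris",
    "response": "The capital of France is Paris.",
    "metric": "faithfulness",
    "chain": faithfulness_chain,
})
print(result["score"], result["processing_time"])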