Spaces:

NIRAJz
/

LMVal-Multi-Metric-LLM-Evaluation

Running

File size: 1,251 Bytes

6d55fec

from langchain.tools import tool
from typing import Dict, Any
import time

@tool
def evaluate_response(question: str, ground_truth: str, response: str, metric: str, 

                     chain: Any, context: str = None) -> Dict[str, Any]:
    """Evaluate a response for a specific metric using LangChain"""
    start_time = time.time()
    
    try:
        # Prepare input based on metric type
        input_data = {
            "question": question,
            "ground_truth": ground_truth,
            "response": response
        }
        
        # Add context for context-based metrics (even if empty)
        if metric in ["context_precision", "context_recall"]:
            input_data["context"] = context if context else "No context provided."
        
        # Use invoke() instead of direct call to fix the tool calling issue
        result = chain.invoke(input_data)
        
        processing_time = time.time() - start_time
        result["processing_time"] = processing_time
        
        return result
    except Exception as e:
        return {
            "score": 0,
            "explanation": f"Evaluation failed: {str(e)}",
            "processing_time": time.time() - start_time
        }