"""
Interface components for chat evaluation
"""
import json
import logging
import os
from typing import Dict, Any, List, Tuple

import gradio as gr
import pandas as pd

from src.analytics.chat_evaluator import ChatEvaluator

logger = logging.getLogger(__name__)

def get_evaluation_status(evaluator, force_reload=False):
"""
Get evaluation status as formatted string and refresh QA data
Args:
evaluator: ChatEvaluator instance
force_reload: If True, force reload data from dataset
Returns:
Status message, updated QA table and refresh message
"""
try:
# First, reset cache if forcing reload
if force_reload:
evaluator.reset_cache()
# Get status data
status = evaluator.get_evaluation_status(force_reload=force_reload)
# Get updated QA table
qa_table = get_qa_pairs_dataframe(evaluator, show_evaluated=False, force_reload=force_reload)
status_message = f"""
Total QA Pairs: {status['total_qa_pairs']}
Evaluated Pairs: {status['evaluated_pairs']}
Unevaluated Pairs: {status['unevaluated_pairs']}
Evaluated Conversations: {status['evaluated_conversations']}
"""
refresh_message = "Data refreshed successfully" if force_reload else ""
return status_message, qa_table, refresh_message
except Exception as e:
logger.error(f"Error getting evaluation status: {e}")
empty_df = pd.DataFrame(columns=["Conversation ID", "Question", "Answer", "Evaluated"])
return f"Error getting status: {str(e)}", empty_df, f"Error: {str(e)}"


def get_qa_pairs_dataframe(evaluator, show_evaluated=False, force_reload=False):
"""
Get QA pairs as DataFrame for the evaluation interface
Args:
evaluator: ChatEvaluator instance
show_evaluated: If True, show only evaluated pairs. If False, show all pairs
force_reload: If True, force reload from dataset
Returns:
DataFrame with QA pairs
"""
try:
# Get QA pairs
qa_pairs = evaluator.get_qa_pairs_for_evaluation(limit=100, force_reload=force_reload)
# Get annotations
annotations = evaluator.get_annotations(force_reload=force_reload)
evaluated_ids = {a.get("conversation_id") for a in annotations}
# Filter pairs based on show_evaluated flag
if show_evaluated:
# Show only evaluated pairs
qa_pairs = [qa for qa in qa_pairs if qa["conversation_id"] in evaluated_ids]
# Convert to DataFrame
if qa_pairs:
df = pd.DataFrame([
{
"Conversation ID": qa["conversation_id"],
"Question": qa["question"][:50] + "..." if len(qa["question"]) > 50 else qa["question"],
"Answer": qa["original_answer"][:100] + "..." if len(qa["original_answer"]) > 100 else qa["original_answer"],
"Evaluated": "Yes" if qa["conversation_id"] in evaluated_ids else "No"
}
for qa in qa_pairs
])
return df
else:
return pd.DataFrame(columns=["Conversation ID", "Question", "Answer", "Evaluated"])
except Exception as e:
logger.error(f"Error getting QA pairs dataframe: {e}")
return pd.DataFrame(columns=["Conversation ID", "Question", "Answer", "Evaluated"])


def load_qa_pair_for_evaluation(conversation_id: str, evaluator: ChatEvaluator) -> Tuple[str, str, str, int, int, int, int, int, str]:
"""
Load a QA pair for evaluation
Args:
conversation_id: ID of the conversation to load
evaluator: ChatEvaluator instance
Returns:
Tuple of (question, original_answer, improved_answer, accuracy, completeness,
relevance, clarity, legal_correctness, notes)
"""
# Get all QA pairs
qa_pairs = evaluator.get_qa_pairs_for_evaluation(limit=1000)
# Get existing annotation if any
annotation = evaluator.get_annotation(conversation_id)
if annotation:
return (
annotation.get("question", ""),
annotation.get("original_answer", ""), # Changed from original_answer
annotation.get("improved_answer", ""), # Changed from improved_answer
annotation.get("ratings", {}).get("accuracy", 1),
annotation.get("ratings", {}).get("completeness", 1),
annotation.get("ratings", {}).get("relevance", 1),
annotation.get("ratings", {}).get("clarity", 1),
annotation.get("ratings", {}).get("legal_correctness", 1),
annotation.get("notes", "")
)
# If no annotation exists, find the conversation in QA pairs
for qa_pair in qa_pairs:
if qa_pair.get("conversation_id") == conversation_id:
return (
qa_pair.get("question", ""),
qa_pair.get("original_answer", ""), # Changed from answer
"", # Empty improved answer
1, # Default ratings
1,
1,
1,
1,
"" # Empty notes
)
return "", "", "", 1, 1, 1, 1, 1, "" # Return empty values if not found


def save_evaluation(
conversation_id: str,
question: str,
original_answer: str,
improved_answer: str,
accuracy: int,
completeness: int,
relevance: int,
clarity: int,
legal_correctness: int,
notes: str,
evaluator: ChatEvaluator
) -> str:
"""
Save evaluation to file and dataset
Args:
        conversation_id: ID of the conversation
        question: User question
        original_answer: Original bot answer
        improved_answer: Improved answer
        accuracy: Rating for factual accuracy (1-5)
        completeness: Rating for completeness (1-5)
        relevance: Rating for relevance (1-5)
        clarity: Rating for clarity (1-5)
        legal_correctness: Rating for legal correctness (1-5)
        notes: Evaluator notes
        evaluator: ChatEvaluator instance
Returns:
Status message
"""
# Create ratings dictionary
ratings = {
"accuracy": accuracy,
"completeness": completeness,
"relevance": relevance,
"clarity": clarity,
"legal_correctness": legal_correctness
}
# Save annotation
success, message = evaluator.save_annotation(
conversation_id=conversation_id,
question=question,
original_answer=original_answer,
improved_answer=improved_answer,
ratings=ratings,
notes=notes
)
return message


def generate_evaluation_report_html(evaluator: ChatEvaluator) -> str:
"""
Generate HTML report of evaluation metrics
Args:
evaluator: ChatEvaluator instance
Returns:
HTML string with report
"""
report = evaluator.generate_evaluation_report()
if report["total_evaluations"] == 0:
return "<p>No evaluations available yet.</p>"
# Format criteria averages
criteria_html = ""
for criterion, avg in report["criteria_averages"].items():
# Calculate stars representation (1-5)
stars = "β
" * int(avg) + "β" * (5 - int(avg))
criteria_html += f"""
<tr>
<td>{criterion.capitalize()}</td>
<td>{avg:.2f}/5.0</td>
<td>{stars}</td>
</tr>
"""
# Overall stars representation
overall_stars = "β
" * int(report["overall_average"]) + "β" * (5 - int(report["overall_average"]))
html = f"""
<div style="padding: 15px; border: 1px solid #ccc; border-radius: 5px; margin-top: 10px;">
<h3>Evaluation Report</h3>
<p><strong>Total Evaluations:</strong> {report["total_evaluations"]}</p>
<p><strong>Overall Average Rating:</strong> {report["overall_average"]:.2f}/5.0 {overall_stars}</p>
<p><strong>Improvement Rate:</strong> {report["improvement_rate"]:.1f}% of responses were improved</p>
<h4>Criteria Ratings:</h4>
<table style="width: 100%; border-collapse: collapse;">
<tr>
<th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Criterion</th>
<th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Average Score</th>
<th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Rating</th>
</tr>
{criteria_html}
</table>
</div>
"""
return html


def export_training_data_action(evaluator: ChatEvaluator, min_rating: int, output_file: str) -> str:
"""
Action for exporting training data
Args:
evaluator: ChatEvaluator instance
min_rating: Minimum average rating (1-5)
output_file: Output file path
Returns:
Status message
"""
if not output_file:
output_file = os.path.join(os.path.dirname(evaluator.annotations_dir), "training_data.jsonl")
success, message = evaluator.export_training_data(output_file, min_rating)
return message
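

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): a minimal Gradio Blocks layout showing how
# the helpers above could be wired together. This is a sketch, not part of the
# application; it assumes ChatEvaluator can be constructed with default
# arguments, which may not match the real constructor.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    evaluator = ChatEvaluator()  # assumption: default constructor is available

    with gr.Blocks() as demo:
        status_box = gr.Textbox(label="Evaluation Status")
        qa_table = gr.Dataframe(label="QA Pairs")
        refresh_msg = gr.Textbox(label="Refresh Message")
        report_html = gr.HTML()
        refresh_btn = gr.Button("Refresh")

        # Pull fresh status, QA table and refresh message from the evaluator
        refresh_btn.click(
            fn=lambda: get_evaluation_status(evaluator, force_reload=True),
            outputs=[status_box, qa_table, refresh_msg],
        )
        # Render the aggregated evaluation report when the page loads
        demo.load(
            fn=lambda: generate_evaluation_report_html(evaluator),
            outputs=[report_html],
        )

    demo.launch()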