Update app.py
Browse files
app.py
CHANGED
|
@@ -46,6 +46,43 @@ if "evaluation_params" not in st.session_state:
|
|
| 46 |
if "show_results" not in st.session_state:
|
| 47 |
st.session_state.show_results = False
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
def run_evaluation_sync(request: EvaluationRequest):
|
| 50 |
"""Run evaluation synchronously with proper event loop handling"""
|
| 51 |
try:
|
|
@@ -481,23 +518,6 @@ def build_request_object(questions: List[str], ground_truths: List[str], model_r
|
|
| 481 |
|
| 482 |
return request
|
| 483 |
|
| 484 |
-
def read_json_file(uploaded_file):
    """Read and parse JSON from an uploaded file object or a local path.

    Args:
        uploaded_file: Either an object exposing ``getvalue()`` (its content
            may be ``bytes`` or ``str``) or a filesystem path to open.

    Returns:
        The parsed JSON value, or ``None`` if reading or parsing fails; the
        failure is reported to the user through ``st.error``.
    """
    try:
        # For Spaces environment, use file uploader content directly
        if hasattr(uploaded_file, 'getvalue'):
            content = uploaded_file.getvalue()
            if isinstance(content, bytes):
                # 'utf-8-sig' decodes plain UTF-8 identically but also drops
                # a leading byte-order mark, which json.loads would reject.
                content = content.decode('utf-8-sig')
            return json.loads(content)

        # For local files (same BOM-tolerant codec as the upload branch)
        with open(uploaded_file, 'r', encoding='utf-8-sig') as f:
            return json.load(f)
    except Exception as e:
        # UI boundary: surface any read/parse problem instead of crashing.
        st.error(f"Error reading JSON file: {e}")
        return None
|
| 500 |
-
|
| 501 |
def main():
|
| 502 |
st.title("🤖 LMVal: Multi-Metric LLM Evaluation")
|
| 503 |
st.markdown("Advanced RAG pipeline evaluation using LangGraph and Groq/OpenAI")
|
|
@@ -639,44 +659,72 @@ def main():
|
|
| 639 |
|
| 640 |
if uploaded_file is not None:
|
| 641 |
try:
|
| 642 |
-
#
|
| 643 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 644 |
|
| 645 |
-
if data:
|
| 646 |
-
#
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
|
| 662 |
-
if questions_list:
|
| 663 |
-
st.success(f"Loaded {len(questions_list)} items from JSON")
|
| 664 |
-
|
| 665 |
-
# Show preview
|
| 666 |
-
with st.expander("Preview loaded data"):
|
| 667 |
-
preview_data = {
|
| 668 |
-
"questions": questions_list[:3] + ["..."] if len(questions_list) > 3 else questions_list,
|
| 669 |
-
"ground_truths": truths_list[:3] + ["..."] if len(truths_list) > 3 else truths_list,
|
| 670 |
-
"model_responses": responses_list[:3] + ["..."] if responses_list and len(responses_list) > 3 else responses_list,
|
| 671 |
-
"contexts": contexts_list[:3] + ["..."] if contexts_list and len(contexts_list) > 3 else contexts_list
|
| 672 |
-
}
|
| 673 |
-
st.json(preview_data)
|
| 674 |
-
else:
|
| 675 |
-
st.warning("No valid data found in the JSON file")
|
| 676 |
-
|
| 677 |
except Exception as e:
|
| 678 |
st.error(f"Error processing JSON file: {e}")
|
| 679 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 680 |
# Run evaluation button
|
| 681 |
run_button = st.button("▶️ Run Evaluation", use_container_width=True,
|
| 682 |
disabled=st.session_state.evaluation_in_progress)
|
|
@@ -852,7 +900,7 @@ def main():
|
|
| 852 |
st.rerun()
|
| 853 |
|
| 854 |
# Clear all history button
|
| 855 |
-
if st.button("Clear All History
|
| 856 |
st.session_state.evaluation_history = []
|
| 857 |
st.success("All history cleared")
|
| 858 |
st.rerun()
|
|
|
|
| 46 |
if "show_results" not in st.session_state:
|
| 47 |
st.session_state.show_results = False
|
| 48 |
|
| 49 |
+
def is_running_on_spaces():
    """Check if we're running on Hugging Face Spaces.

    Returns:
        bool: True when any known Spaces marker variable is present in the
        process environment (even if its value is empty), otherwise False.
    """
    # SPACES_APP_TYPE is the variable this app originally probed; SPACE_ID is
    # the helper variable listed in the Spaces documentation. Accepting either
    # keeps the old behaviour while making detection work when only the
    # documented variable is set.
    marker_variables = ("SPACES_APP_TYPE", "SPACE_ID")
    return any(name in os.environ for name in marker_variables)
|
| 52 |
+
|
| 53 |
+
def create_sample_data():
    """Build the built-in demonstration dataset.

    Returns:
        dict: Four parallel lists of five strings each, keyed (in order) by
        "questions", "ground_truths", "model_responses" and "contexts".
        A fresh dict with fresh lists is produced on every call.
    """
    # One tuple per sample: (question, ground truth, model response, context).
    # NOTE: the last two model responses contradict their ground truths.
    samples = [
        (
            "What is the capital of France?",
            "The capital of France is Paris.",
            "Paris is the capital city of France.",
            "France is a country in Western Europe with Paris as its capital.",
        ),
        (
            "How does photosynthesis work?",
            "Photosynthesis is the process by which plants convert sunlight into energy.",
            "Plants use sunlight to create energy through photosynthesis.",
            "Photosynthesis is a biological process used by plants to create energy.",
        ),
        (
            "What is the theory of relativity?",
            "The theory of relativity was developed by Albert Einstein.",
            "Einstein developed the theory of relativity.",
            "Albert Einstein was a physicist who developed the theory of relativity.",
        ),
        (
            "What is the main ingredient in guacamole?",
            "The main ingredient in guacamole is avocado.",
            "The main ingredient in guacamole is tomato.",
            "Guacamole is an avocado-based dip first developed in Mexico.",
        ),
        (
            "Who developed the theory of relativity?",
            "Albert Einstein developed the theory of relativity.",
            "Isaac Newton developed the theory of relativity.",
            "Albert Einstein was a German-born theoretical physicist who developed the theory of relativity.",
        ),
    ]

    field_names = ("questions", "ground_truths", "model_responses", "contexts")
    # Transpose the per-sample rows into one list per field.
    columns = zip(*samples)
    return {name: list(column) for name, column in zip(field_names, columns)}
|
| 85 |
+
|
| 86 |
def run_evaluation_sync(request: EvaluationRequest):
|
| 87 |
"""Run evaluation synchronously with proper event loop handling"""
|
| 88 |
try:
|
|
|
|
| 518 |
|
| 519 |
return request
|
| 520 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 521 |
def main():
|
| 522 |
st.title("🤖 LMVal: Multi-Metric LLM Evaluation")
|
| 523 |
st.markdown("Advanced RAG pipeline evaluation using LangGraph and Groq/OpenAI")
|
|
|
|
| 659 |
|
| 660 |
if uploaded_file is not None:
|
| 661 |
try:
|
| 662 |
+
# Read content directly from the uploaded file
|
| 663 |
+
content = uploaded_file.getvalue()
|
| 664 |
+
if isinstance(content, bytes):
|
| 665 |
+
content = content.decode('utf-8')
|
| 666 |
+
|
| 667 |
+
data = json.loads(content)
|
| 668 |
+
|
| 669 |
+
# Handle different JSON structures
|
| 670 |
+
questions_list = []
|
| 671 |
+
truths_list = []
|
| 672 |
+
responses_list = []
|
| 673 |
+
contexts_list = []
|
| 674 |
|
| 675 |
+
if isinstance(data, dict):
|
| 676 |
+
# Standard format with separate arrays
|
| 677 |
+
questions_list = data.get("questions", [])
|
| 678 |
+
truths_list = data.get("ground_truths", [])
|
| 679 |
+
responses_list = data.get("model_responses", [])
|
| 680 |
+
contexts_list = data.get("contexts", [])
|
| 681 |
+
elif isinstance(data, list):
|
| 682 |
+
# List of question objects
|
| 683 |
+
for item in data:
|
| 684 |
+
if isinstance(item, dict):
|
| 685 |
+
questions_list.append(item.get("question", ""))
|
| 686 |
+
truths_list.append(item.get("ground_truth", ""))
|
| 687 |
+
responses_list.append(item.get("model_response", ""))
|
| 688 |
+
contexts_list.append(item.get("context", ""))
|
| 689 |
+
|
| 690 |
+
if questions_list:
|
| 691 |
+
st.success(f"Loaded {len(questions_list)} items from JSON")
|
| 692 |
+
|
| 693 |
+
# Show preview
|
| 694 |
+
with st.expander("Preview loaded data"):
|
| 695 |
+
preview_data = {
|
| 696 |
+
"questions": questions_list[:3] + ["..."] if len(questions_list) > 3 else questions_list,
|
| 697 |
+
"ground_truths": truths_list[:3] + ["..."] if len(truths_list) > 3 else truths_list,
|
| 698 |
+
"model_responses": responses_list[:3] + ["..."] if responses_list and len(responses_list) > 3 else responses_list,
|
| 699 |
+
"contexts": contexts_list[:3] + ["..."] if contexts_list and len(contexts_list) > 3 else contexts_list
|
| 700 |
+
}
|
| 701 |
+
st.json(preview_data)
|
| 702 |
+
else:
|
| 703 |
+
st.warning("No valid data found in the JSON file")
|
| 704 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 705 |
except Exception as e:
|
| 706 |
st.error(f"Error processing JSON file: {e}")
|
| 707 |
|
| 708 |
+
# Add sample data button for Spaces
|
| 709 |
+
if is_running_on_spaces() and not questions_list:
|
| 710 |
+
if st.button("📋 Load Sample Data", help="Load sample data for testing"):
|
| 711 |
+
sample_data = create_sample_data()
|
| 712 |
+
questions_list = sample_data["questions"]
|
| 713 |
+
truths_list = sample_data["ground_truths"]
|
| 714 |
+
responses_list = sample_data["model_responses"]
|
| 715 |
+
contexts_list = sample_data["contexts"]
|
| 716 |
+
|
| 717 |
+
st.success("Sample data loaded successfully!")
|
| 718 |
+
|
| 719 |
+
# Show preview
|
| 720 |
+
with st.expander("Preview sample data"):
|
| 721 |
+
st.json({
|
| 722 |
+
"questions": questions_list,
|
| 723 |
+
"ground_truths": truths_list,
|
| 724 |
+
"model_responses": responses_list,
|
| 725 |
+
"contexts": contexts_list
|
| 726 |
+
})
|
| 727 |
+
|
| 728 |
# Run evaluation button
|
| 729 |
run_button = st.button("▶️ Run Evaluation", use_container_width=True,
|
| 730 |
disabled=st.session_state.evaluation_in_progress)
|
|
|
|
| 900 |
st.rerun()
|
| 901 |
|
| 902 |
# Clear all history button
|
| 903 |
+
if st.button("Clear All History", use_container_width=True, type="secondary"):
|
| 904 |
st.session_state.evaluation_history = []
|
| 905 |
st.success("All history cleared")
|
| 906 |
st.rerun()
|