File size: 5,024 Bytes
da40f6e
10e9b7d
da40f6e
 
 
 
 
 
e80aab9
da40f6e
 
 
31243f4
 
da40f6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c4371f
da40f6e
 
 
 
 
 
 
 
 
 
 
 
3c4371f
da40f6e
 
 
 
 
 
 
 
 
e80aab9
31243f4
da40f6e
 
 
 
31243f4
da40f6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31243f4
da40f6e
 
 
 
31243f4
da40f6e
 
 
e80aab9
da40f6e
 
 
 
 
e80aab9
da40f6e
 
7d65c66
da40f6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7d65c66
da40f6e
 
 
7d65c66
da40f6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# app.py — Final GAIA Assignment Template (Enhanced)

import streamlit as st
from smolagents import CodeAgent, DuckDuckGoSearchTool, PythonREPLTool, HfApiModel
from huggingface_hub import login
import json
import time
import os

# =========================
# 1. Define the GAIA Agent
# =========================
class BasicAgent:
    """GAIA question-answering agent wrapping a smolagents ``CodeAgent``.

    Calling an instance with a question builds a short instruction prompt,
    runs the underlying agent and returns a cleaned-up answer string.
    ``"N/A"`` is returned when the agent fails or produces no answer.
    """

    # Labels that may precede the answer; sanitize() removes them in this order.
    _ANSWER_PREFIXES = ("FINAL ANSWER:", "Final Answer:", "Answer:", "answer:")

    def __init__(self):
        """Create the model, the tool list and the ``CodeAgent`` instance."""
        st.write("πŸ”§ Initializing enhanced GAIA Agent...")

        # Core model from Hugging Face
        self.model = HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct")

        # Tools for reasoning and search
        self.tools = [
            DuckDuckGoSearchTool(),
            PythonREPLTool()
        ]

        # Create a CodeAgent instance
        self.agent = CodeAgent(
            tools=self.tools,
            model=self.model,
            name="GAIA_Level1_Agent",
            description="Hybrid reasoning agent using web + code execution to answer GAIA L1 questions.",
            max_steps=5
        )

    def sanitize(self, text: object) -> str:
        """Clean and simplify a raw agent output for benchmark scoring.

        Args:
            text: The raw agent output. Usually a string, but any object is
                accepted and converted with ``str()``.

        Returns:
            The answer with known answer labels removed, one pair of
            surrounding double quotes removed and whitespace runs collapsed
            to single spaces. ``""`` when ``text`` is ``None`` or empty.
        """
        if text is None:
            return ""
        # Non-string results (e.g. the number 0 or 42) used to be dropped or
        # to raise AttributeError on .strip(); convert them instead so a
        # numeric answer is not lost.
        if not isinstance(text, str):
            text = str(text)
        text = text.strip()
        if not text:
            return ""
        for prefix in self._ANSWER_PREFIXES:
            if text.startswith(prefix):
                text = text[len(prefix):].strip()
        if text.startswith('"') and text.endswith('"'):
            text = text[1:-1]
        # Collapse internal runs of whitespace (including newlines).
        return " ".join(text.split())

    def __call__(self, question: str) -> str:
        """Run the agent on a single GAIA question.

        Args:
            question: The question text; the first 80 characters are echoed
                to the page as progress output.

        Returns:
            The sanitized answer, or ``"N/A"`` when the answer is empty or
            the agent raised an exception.
        """
        st.write(f"πŸ€– Running agent on: {question[:80]}...")
        prompt = (
            "You are a concise reasoning agent. "
            "Use your tools to find accurate answers. "
            "Always return only the final answer (no explanations).\n\n"
            f"Question: {question}"
        )

        try:
            response = self.agent.run(prompt)
            clean_answer = self.sanitize(response)
            st.write(f"βœ… Final Answer: {clean_answer}")
            return clean_answer or "N/A"
        except Exception as e:
            # Boundary handler: one failing question must not abort the
            # whole evaluation run, so report it and fall back to "N/A".
            st.error(f"⚠️ Agent failed: {e}")
            return "N/A"


# =======================================
# 2. Streamlit UI and GAIA Leaderboard
# =======================================
# Page-level settings are applied first, followed by the heading and the
# introductory instructions shown at the top of the app.
page_settings = {"page_title": "GAIA Final Assignment", "layout": "centered"}
st.set_page_config(**page_settings)

st.title("πŸ€– GAIA Benchmark Final Assignment")

intro_markdown = """
Welcome to your **Final Assignment** for the Agents course!

This app evaluates your custom agent on a subset of **GAIA Level 1** benchmark questions.
To pass and earn your certificate πŸ…, your agent must score **β‰₯ 30% accuracy**.

---

### 🧠 Steps
1. Log in to your **Hugging Face** account.  
2. Run your **agent** on the GAIA dataset.  
3. Automatically submit your results for scoring.

---
"""
st.markdown(intro_markdown)

# =========================
# 3. Login Section
# =========================
# Ask for the access token (masked input) and log in on button press.
hf_token = st.text_input("πŸ”‘ Enter your Hugging Face access token:", type="password")
if st.button("Login to Hugging Face"):
    # Pasted tokens often carry stray whitespace; an empty value would only
    # produce a confusing login error, so reject it up front.
    cleaned_token = (hf_token or "").strip()
    if not cleaned_token:
        st.warning("Please enter your Hugging Face access token first.")
    else:
        try:
            login(token=cleaned_token)
            st.success("βœ… Logged in successfully!")
        except Exception as e:
            # UI boundary: show the failure instead of crashing the app.
            st.error(f"Login failed: {e}")

# =========================
# 4. Load GAIA Questions
# =========================
# Download the question file into the working directory on button press.
if st.button("🧩 Load GAIA Dataset"):
    st.info("Fetching 20 GAIA Level 1 questions...")
    # os.system returns the command's exit status; anything non-zero means
    # the download did not succeed (wget missing, network error, HTTP error).
    exit_status = os.system("wget -q https://huggingface.co/spaces/agents-course/Final_Assignment_Template/resolve/main/questions.json -O questions.json")
    downloaded = (
        exit_status == 0
        and os.path.exists("questions.json")
        and os.path.getsize("questions.json") > 0
    )
    if downloaded:
        st.success("βœ… Dataset loaded!")
    else:
        # A failed download can leave an empty or partial output file behind;
        # remove it so the evaluation step does not try to parse it.
        if os.path.exists("questions.json"):
            os.remove("questions.json")
        st.error(f"Failed to download the GAIA dataset (wget exit status {exit_status}).")

# =========================
# 5. Run Evaluation
# =========================
# Answer every loaded question, save the answers and run the submit command.
if st.button("πŸš€ Run Evaluation & Submit All Answers"):
    if not os.path.exists("questions.json"):
        st.warning("Please load the GAIA dataset first.")
    else:
        questions = None
        try:
            with open("questions.json", "r", encoding="utf-8") as f:
                data = json.load(f)
            questions = data["questions"]
        except (OSError, ValueError, KeyError, TypeError) as e:
            # ValueError covers json.JSONDecodeError; KeyError/TypeError cover
            # a file that parses but has no top-level "questions" entry.
            st.error(f"Could not read questions.json: {e}")

        if questions is not None:
            agent = BasicAgent()
            results = {}

            # NOTE(review): each entry is passed to the agent (which slices it)
            # and used as a dict key, so entries are assumed to be plain
            # strings — confirm against the dataset schema.
            for i, q in enumerate(questions, start=1):
                st.write(f"### Question {i}:")
                st.write(q)
                ans = agent(q)
                results[q] = ans
                # Brief pause between questions (presumably to ease rate limits).
                time.sleep(1)

            # Save answers
            with open("answers.json", "w", encoding="utf-8") as f:
                json.dump(results, f, indent=2)
            st.success("βœ… All questions answered and saved as answers.json")

            # Auto-submit via huggingface CLI (if supported)
            st.info("πŸ“€ Submitting answers to GAIA leaderboard...")
            submit_status = os.system("python3 -m smolagents.eval_gaia submit answers.json")
            # Only report success when the submit command actually exited cleanly.
            if submit_status == 0:
                st.success("πŸŽ‰ Submission complete! Check your score on the leaderboard.")
            else:
                st.error(f"Submission command failed (exit status {submit_status}).")

# =========================
# 6. Notes
# =========================
# Closing hints rendered at the bottom of the page.
notes_markdown = """
---
### ℹ️ Notes
- You can edit the agent logic inside the `BasicAgent` class to boost performance.  
- Use more reasoning, examples, or API calls for higher accuracy.  
- Make your Space **public** before submitting.

Good luck on the GAIA leaderboard! 🌍
"""
st.markdown(notes_markdown)