import os

import gradio as gr
import pandas as pd
import requests
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Base URL of the scoring service; the "/questions" and "/submit" endpoints
# used by this file are built from it.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
class BasicAgent:
    """Answer exam-style questions with a locally loaded causal language model.

    The agent wraps a Hugging Face tokenizer/model pair. Calling the instance
    with a question builds a fixed "strict exam solver" prompt, generates a
    completion with greedy decoding and returns the text the model produced
    as its final answer.
    """

    # The prompt ends with this marker. If the model repeats it inside the
    # completion, only the text after the last occurrence is kept.
    ANSWER_MARKER = "Final Answer:"

    def __init__(
        self,
        model_name: str = "Qwen/Qwen2.5-1.5B-Instruct",
        max_new_tokens: int = 80,
    ):
        """Load the tokenizer and model.

        Args:
            model_name: Hugging Face model identifier to load.
            max_new_tokens: Upper bound on tokens generated per question.
        """
        print("Loading HF model...")
        self.max_new_tokens = max_new_tokens
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            device_map="auto",
        )

    @classmethod
    def _build_prompt(cls, question: str) -> str:
        """Return the instruction prompt for *question*, ending with the marker."""
        return (
            "\n"
            "You are a strict exam solver.\n"
            "Give ONLY the final answer. No explanation. No extra words.\n"
            "\n"
            "Question:\n"
            f"{question}\n"
            "\n"
            f"{cls.ANSWER_MARKER}\n"
        )

    @classmethod
    def _extract_answer(cls, completion: str) -> str:
        """Return the stripped answer text found in a model completion."""
        if cls.ANSWER_MARKER in completion:
            completion = completion.split(cls.ANSWER_MARKER)[-1]
        return completion.strip()

    def __call__(self, question: str) -> str:
        """Generate and return the model's answer to *question*.

        Args:
            question: The question text to answer.

        Returns:
            The generated answer with surrounding whitespace removed.
        """
        prompt = self._build_prompt(question)
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Greedy decoding only: sampling knobs such as `temperature` have no
        # effect when do_sample=False, so none are passed.
        output_ids = self.model.generate(
            **inputs,
            max_new_tokens=self.max_new_tokens,
            do_sample=False,
        )

        # The returned sequence starts with the prompt tokens; decode only the
        # newly generated tail so the prompt text never leaks into the answer.
        prompt_len = inputs["input_ids"].shape[-1]
        completion = self.tokenizer.decode(
            output_ids[0][prompt_len:],
            skip_special_tokens=True,
        )
        return self._extract_answer(completion)
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Run the agent on every fetched question and submit the answers.

    Steps: require a logged-in user, build the agent, download the question
    list, answer each question, then POST all answers to the scoring service.

    Args:
        profile: The logged-in user's OAuth profile, or None when nobody is
            logged in. The click handler is wired without explicit inputs, so
            Gradio is expected to supply this from the type annotation (see
            the Gradio OAuth documentation); keep the annotation as is.

    Returns:
        A ``(status_message, results)`` tuple. ``results`` is a pandas
        DataFrame of per-question answers, or None when the run stopped
        before any question was processed.
    """
    # Guard clause: a submission must be attributed to a user.
    if not profile:
        return "Please login to HuggingFace.", None
    username = f"{profile.username}"

    # NOTE: when SPACE_ID is unset the code link below contains "None".
    space_id = os.getenv("SPACE_ID")

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    try:
        agent = BasicAgent()
    except Exception as e:
        return f"Error initializing agent: {e}", None

    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

    try:
        response = requests.get(questions_url, timeout=20)
        response.raise_for_status()
        questions = response.json()
    except Exception as e:
        return f"Error fetching questions: {e}", None

    # Reject payloads that cannot be iterated as a list of question records.
    if not isinstance(questions, list) or not questions:
        return "Fetched questions list is empty or invalid format.", None

    results_log = []
    answers_payload = []

    for q in questions:
        # Skip malformed entries instead of letting a KeyError abort the run.
        if not isinstance(q, dict):
            continue
        task_id = q.get("task_id")
        text = q.get("question")
        if not task_id or text is None:
            continue

        # A failure on one question is recorded as its answer so the
        # remaining questions are still attempted.
        try:
            ans = agent(text)
        except Exception as e:
            ans = f"ERROR: {e}"

        answers_payload.append({"task_id": task_id, "submitted_answer": ans})
        results_log.append({"Task ID": task_id, "Question": text, "Submitted Answer": ans})

    if not answers_payload:
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload,
    }

    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()

        final_msg = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Score: {result_data.get('score')}%\n"
            f"Correct: {result_data.get('correct_count')}/{result_data.get('total_attempted')}\n"
            f"Message: {result_data.get('message')}"
        )
        return final_msg, pd.DataFrame(results_log)
    except Exception as e:
        # Still return the answers table so the user can inspect what was produced.
        return f"Submission Failed: {e}", pd.DataFrame(results_log)
# Gradio UI: a login button, a single "run" button, and two outputs
# (status text and the table of submitted answers).
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 GAIA Unit 4 Agent Evaluation Runner")
    gr.Markdown("Login → Run agent → Auto-submit to leaderboard")

    gr.LoginButton()

    run_btn = gr.Button("🚀 Run Evaluation & Submit All Answers")

    status_box = gr.Textbox(label="Status", lines=5)
    table = gr.DataFrame(label="Agent Answers")

    # No `inputs` are listed: the handler's only parameter is the OAuth
    # profile, which is not a UI component.
    run_btn.click(
        run_and_submit_all,
        outputs=[status_box, table],
    )
# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()