Commit 9da9431
1 Parent(s): ad604a4

refine leaderboard

Files changed:
- app.py (+3 -2)
- content.py (+7 -4)
app.py CHANGED

@@ -8,7 +8,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
 
 # InfoStrings
 from scorer import question_scorer
-from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT,
+from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, LEADERBOARD_TEXT, LEADERBOARD_HTML, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION
 
 TOKEN = os.environ.get("TOKEN", None)
 

@@ -58,7 +58,8 @@ with demo:
 elem_id="citation-button",
 lines=10,
 )
-
+# gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")
+gr.HTML(LEADERBOARD_HTML)
 with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
 human_leaderboard_table_test = gr.components.Dataframe(
 value=human_eval_dataframe_test, datatype=TYPES, interactive=False,
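For context, a minimal runnable sketch of the Gradio pattern the second hunk adds (a simplified stand-in, not the actual app.py; the TITLE, LEADERBOARD_TEXT, and LEADERBOARD_HTML values below are hypothetical placeholders rather than the strings defined in content.py):

```python
# Simplified sketch, not the actual app.py: shows how module-level strings
# imported from content.py are rendered inside the Gradio Blocks layout.
import gradio as gr

# Hypothetical placeholder values; the real strings live in content.py.
TITLE = "<h1>Online Mind2Web Leaderboard</h1>"
LEADERBOARD_TEXT = "### Leaderboard\nWe maintain two leaderboards."
LEADERBOARD_HTML = "<h3>Leaderboard</h3><p>Automated and human evaluation tracks.</p>"

with gr.Blocks() as demo:
    gr.HTML(TITLE)
    # The commit adds an active gr.HTML block and keeps a Markdown
    # alternative commented out, as in the second hunk above.
    # gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")
    gr.HTML(LEADERBOARD_HTML)

if __name__ == "__main__":
    demo.launch()
```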
content.py CHANGED

@@ -9,11 +9,14 @@ LINKS = """
 """
 
 INTRODUCTION_TEXT = """
-Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains.
-
-
+Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains.
+Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1–5 steps), Medium (6–10 steps), and Hard (11+ steps).
+"""
 
-
+LEADERBOARD_TEXT = """
+### Leaderboard
+We maintain two leaderboards—one for automated evaluation and another for human evaluation.
+All submissions will be auto-evaluated internally, and if human evaluation results are provided, a subset will be selected for rigorous spot-check verification.
 """
 
 SUBMISSION_TEXT = """
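LEADERBOARD_HTML is added to the app.py import but is not defined in the hunks shown here; presumably it is another module-level string in content.py. A purely illustrative sketch of what such a constant could look like (the markup is a guess, not taken from the repository):

```python
# Illustrative guess only: not shown in this diff. app.py passes this string
# to gr.HTML, so it is presumably raw HTML defined alongside the other constants.
LEADERBOARD_HTML = """
<h3>Leaderboard</h3>
<p>We maintain two leaderboards: one for automated evaluation and one for human evaluation.</p>
"""
```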