Add cybergym
Browse files
- meta_data.py +6 -0
- results.json +32 -0
meta_data.py
CHANGED
|
@@ -85,4 +85,10 @@ LEADERBOARD_MD['SWE-bench-verified'] = """This is a human-validated subset of SW
|
|
| 85 |
|
| 86 |
Paper: https://openai.com/index/introducing-swe-bench-verified/
|
| 87 |
Code: https://github.com/swe-bench/SWE-bench
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
"""
|
|
|
|
| 85 |
|
| 86 |
Paper: https://openai.com/index/introducing-swe-bench-verified/
|
| 87 |
Code: https://github.com/swe-bench/SWE-bench
|
| 88 |
+
"""
|
| 89 |
+
|
| 90 |
+
LEADERBOARD_MD['CyberGym'] = """This is a large-scale and high-quality cybersecurity evaluation framework featuring 1,507 real-world vulnerabilities found and patched across 188 large software projects.
|
| 91 |
+
|
| 92 |
+
Paper: https://arxiv.org/abs/2506.02548
|
| 93 |
+
Code: https://github.com/sunblaze-ucb/cybergym
|
| 94 |
"""
|
results.json
CHANGED
|
@@ -797,6 +797,38 @@
|
|
| 797 |
"RAG + SWE-Llama 13B": 1.20,
|
| 798 |
"RAG + ChatGPT 3.5": 0.40
|
| 799 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 800 |
}
|
| 801 |
}
|
| 802 |
}
|
|
|
|
| 797 |
"RAG + SWE-Llama 13B": 1.20,
|
| 798 |
"RAG + ChatGPT 3.5": 0.40
|
| 799 |
}
|
| 800 |
+
},
|
| 801 |
+
"CyberGym": {
|
| 802 |
+
"% Reproducing Target Vuln.": {
|
| 803 |
+
"OpenHands + Claude-Sonnet-4": 17.85,
|
| 804 |
+
"OpenHands + Claude-3.7-Sonnet": 11.94,
|
| 805 |
+
"OpenHands + GPT-4.1": 9.36,
|
| 806 |
+
"Cybench + GPT-4.1": 8.96,
|
| 807 |
+
"Codex + GPT-4.1": 7.37,
|
| 808 |
+
"ENiGMA + GPT-4.1": 7.23,
|
| 809 |
+
"OpenHands + Gemini-2.5-Flash": 4.84,
|
| 810 |
+
"OpenHands + DeepSeek-V3": 3.58,
|
| 811 |
+
"OpenHands + o4-mini": 2.46,
|
| 812 |
+
"OpenHands + R2E-Gym-32B": 1.99,
|
| 813 |
+
"OpenHands + Qwen3-235B-A22B": 1.86,
|
| 814 |
+
"OpenHands + OpenHands-LM-32B": 1.66,
|
| 815 |
+
"OpenHands + SWE-Gym-32B": 0.07
|
| 816 |
+
},
|
| 817 |
+
"% Finding Post-Patch Vuln.": {
|
| 818 |
+
"OpenHands + Claude-Sonnet-4": 1.99,
|
| 819 |
+
"OpenHands + Claude-3.7-Sonnet": 2.19,
|
| 820 |
+
"OpenHands + GPT-4.1": 1.26,
|
| 821 |
+
"Cybench + GPT-4.1": 2.26,
|
| 822 |
+
"Codex + GPT-4.1": 1.19,
|
| 823 |
+
"ENiGMA + GPT-4.1": 1.92,
|
| 824 |
+
"OpenHands + Gemini-2.5-Flash": 0.80,
|
| 825 |
+
"OpenHands + DeepSeek-V3": 0.66,
|
| 826 |
+
"OpenHands + o4-mini": 0.07,
|
| 827 |
+
"OpenHands + R2E-Gym-32B": 0.60,
|
| 828 |
+
"OpenHands + Qwen3-235B-A22B": 0.33,
|
| 829 |
+
"OpenHands + OpenHands-LM-32B": 0.33,
|
| 830 |
+
"OpenHands + SWE-Gym-32B": 0.07
|
| 831 |
+
}
|
| 832 |
}
|
| 833 |
}
|
| 834 |
}
|