{
"time": "20250422",
"results": {
"CyberSecEval-3":{
"Social engineering":{
"GPT-4-Turbo": 79.6,
"Qwen2-72B-Instruct": 70.4,
"Llama-3-70B": 59,
"Llama-3-405B": 52,
"Mixtral-8x22B": 33.6
},
"Software vulnerability exploitation":{
"GPT-4-Turbo": 40,
"Gemini Pro 1.0": 29,
"Llama-3-70B": 41,
"Llama-3-405B": 49,
"Mixtral-8x22B": 35
},
"Prompt injection attack success rates": {
"GPT-4-Turbo": 17,
"Gemini Pro 1.0": 18,
"Llama-3-70B": 26,
"Llama-3-405B": 22,
"Mixtral-8x22B": 35,
"Qwen2-72B-Instruct": 20
},
"Autocomplete-based insecure code generation": {
"GPT-4-Turbo": 30,
"Gemini Pro 1.0": 25,
"Llama-3-70B": 28,
"Llama-3-405B": 31,
"Mixtral-8x22B": 25,
"Qwen2-72B-Instruct": 30
},
"Instruction-based insecure code generation": {
"GPT-4-Turbo": 35,
"Gemini Pro 1.0": 32,
"Llama-3-70B": 35,
"Llama-3-405B": 39,
"Mixtral-8x22B": 34,
"Qwen2-72B-Instruct": 34
},
"Code interpreter abuse compliance rates":{
"GPT-4-Turbo": 1,
"Gemini Pro 1.0": 11,
"Llama-3-70B": 42,
"Llama-3-405B": 1,
"Mixtral-8x22B": 20,
"Qwen2-72B-Instruct": 5
},
"Cyber attack helpfulness compliance rates": {
"GPT-4-Turbo": 42,
"Gemini Pro 1.0": 92,
"Llama-3-70B": 78,
"Llama-3-405B": 38,
"Mixtral-8x22B": 80,
"Qwen2-72B-Instruct": 47
}
},
"SecCodePLT": {
"Secure instruction generation failure rates w/o security policy (rule-based metric)": {
"CodeLlama-34B-Instruct": 66,
"Llama-3.1-70B": 47,
"Mixtral-8x22B": 58,
"GPT-4o": 44
},
"Secure instruction generation failure rates w/ security policy (rule-based metric)": {
"CodeLlama-34B-Instruct": 41,
"Llama-3.1-70B": 17,
"Mixtral-8x22B": 30,
"GPT-4o": 16
},
"Secure instruction generation failure rates w/o security policy (Pass@1)": {
"CodeLlama-34B-Instruct": 77,
"Llama-3.1-70B": 62,
"Mixtral-8x22B": 66,
"GPT-4o": 47
},
"Secure instruction generation failure rates w/ security policy (Pass@1)": {
"CodeLlama-34B-Instruct": 70,
"Llama-3.1-70B": 53,
"Mixtral-8x22B": 58,
"GPT-4o": 38
},
"Secure code completion failure rates w/o security policy (rule-based metric)": {
"CodeLlama-34B-Instruct": 78,
"Llama-3.1-70B": 54,
"Mixtral-8x22B": 66,
"GPT-4o": 48
},
"Secure code completion failure rates w/ security policy (rule-based metric)": {
"CodeLlama-34B-Instruct": 59,
"Llama-3.1-70B": 23,
"Mixtral-8x22B": 61,
"GPT-4o": 21
},
"Secure code completion failure rates w/o security policy (Pass@1)": {
"CodeLlama-34B-Instruct": 77,
"Llama-3.1-70B": 57,
"Mixtral-8x22B": 69,
"GPT-4o": 44
},
"Secure code completion failure rates w/ security policy (Pass@1)": {
"CodeLlama-34B-Instruct": 75,
"Llama-3.1-70B": 45,
"Mixtral-8x22B": 56,
"GPT-4o": 34
},
"Reconnaissance": {
"GPT-4o": 52,
"Claude-3.5-Sonnet": 46,
"LLaMA-3.1-70B": 10
},
"Weaponization & Infiltration": {
"GPT-4o": 10,
"Claude-3.5-Sonnet": 0,
"LLaMA-3.1-70B": 0
},
"C2 & Execution": {
"GPT-4o": 52,
"Claude-3.5-Sonnet": 46,
"LLaMA-3.1-70B": 10
},
"Discovery": {
"GPT-4o": 82,
"Claude-3.5-Sonnet": 88,
"LLaMA-3.1-70B": 60
},
"Collection": {
"GPT-4o": 86,
"Claude-3.5-Sonnet": 92,
"LLaMA-3.1-70B": 28
}
},
"RedCode": {
"RedCode-Gen (Accuracy)": {
"GPT-4o": {
"Base LLM": 69.4,
"Code Agent": 72.5
},
"GPT-4": {
"Base LLM": 65.0,
"Code Agent": 66.9
},
"GPT-3.5": {
"Base LLM": 0.0,
"Code Agent": 32.5
},
"Claude-3-Opus": {
"Base LLM": 1.3,
"Code Agent": 3.1
},
"DeepSeekCoder-6.7B": {
"Base LLM": 49.4,
"Code Agent": 79.4
},
"CodeLlama-7B": {
"Base LLM": 40.0,
"Code Agent": 42.0
},
"CodeLlama-13B": {
"Base LLM": 49.4,
"Code Agent": 66.3
},
"Llama-2-7B": {
"Base LLM": 16.9,
"Code Agent": 20.7
},
"Mistral-7B": {
"Base LLM": 46.3,
"Code Agent": 75.3
}
},
"RedCode-Exec: Python (Attack success rate)": {
"GPT-4o": {
"ReAct": 77.23
},
"GPT-4": {
"ReAct": 64.50
},
"GPT-3.5": {
"ReAct": 76.23
},
"Claude-3.5-Sonnet": {
"ReAct": 67.63
},
"DeepSeekCoder-6.7B": {
"ReAct": 80.23,
"OCI": 48.87
},
"DeepSeekCoder-v2-lite": {
"ReAct": 79.77
},
"CodeQwen1.5-7B-Chat": {
"ReAct": 77.57
},
"Llama-3.1-70B-Instruct": {
"ReAct": 76.7
},
"Llama-3.1-8B-Instruct": {
"ReAct": 62.87
},
"Llama-3-8B-Instruct": {
"ReAct": 42.50
},
"CodeLlama-13B": {
"CodeAct": 71.87,
"ReAct": 60.13,
"OCI": 49.07
},
"CodeLlama-7B": {
"CodeAct": 61.83,
"ReAct": 58.43,
"OCI": 46.80
},
"Llama-2-7B": {
"CodeAct": 69.95
},
"Mistral-7B": {
"CodeAct": 62.60
}
},
"RedCode-Exec: Bash (Attack success rate)": {
"GPT-4o": {
"ReAct": 72.83
},
"GPT-4": {
"ReAct": 61.96
},
"GPT-3.5": {
"ReAct": 70.38
},
"Claude-3.5-Sonnet": {
"ReAct": 62.67
},
"DeepSeekCoder-6.7B": {
"ReAct": 73.17
},
"DeepSeekCoder-v2-lite": {
"ReAct": 68.42
},
"CodeQwen1.5-7B-Chat": {
"ReAct": 71.92
},
"Llama-3.1-70B-Instruct": {
"ReAct": 74.38
},
"Llama-3-8B-Instruct": {
"ReAct": 62.25
},
"Llama-3.1-8B-Instruct": {
"ReAct": 59.83
},
"CodeLlama-13B": {
"ReAct": 65.25
},
"CodeLlama-7B": {
"ReAct": 56.21
}
}
},
"CyBench": {
"Unguided % solved":{
"GPT-4o": 12.5,
"GPT-4.5-preview": 17.5,
"o1-preview": 10.0,
"o1-mini": 10.0,
"o3-mini": 22.5,
"Claude-3-Opus": 10.0,
"Claude-3.5-Sonnet": 17.5,
"Claude-3.7-Sonnet": 20,
"Gemini-1.5-pro": 7.5,
"Llama-3.1-405B": 7.5,
"Mixtral-8x22B": 7.5,
"Gemini 1.5 Pro": 7.5,
"Llama-3-70B": 5.0
},
"Subtask-guided % solved": {
"Claude-3.5-Sonnet": 15.0,
"GPT-4o": 17.5,
"Claude-3-Opus": 12.5,
"o1-preview": 10.0,
"Llama-3.1-405B": 15.0,
"Mixtral-8x22B": 5.0,
"Gemini 1.5 Pro": 5.0,
"Llama-3-70B": 7.5
},
"Subtasks % solved": {
"Claude-3.5-Sonnet": 43.9,
"GPT-4o": 28.7,
"Claude-3-Opus": 36.8,
"o1-preview": 46.8,
"Llama-3.1-405B": 20.5,
"Mixtral-8x22B": 15.2,
"Gemini 1.5 Pro": 11.7,
"Llama-3-70B": 8.2
}
},
"NYU CTF Bench": {
"Pass@1": {
"Claude-3.5-Sonnet": {
"D-CIPHER": 19.00,
"EnIGMA": 13.50
},
"GPT-4o": {
"D-CIPHER": 10.50,
"EnIGMA": 9.50
},
"GPT-4": {
"EnIGMA": 7.00
}
}
},
"CyberBench": {
"Average": {
"Falcon-7B": 39.4,
"Falcon-7B-Instruct": 37.5,
"Vicuna-7B-v1.5": 53.0,
"Mistral-7B-v0.1": 58.1,
"Mistral-7B-Instruct-v0.1": 55.0,
"Zephyr-7B-beta": 57.7,
"Llama-2-7B": 50.6,
"Llama-2-7B-Chat": 44.6,
"Vicuna-13B-v1.5": 57.3,
"Llama-2-13B": 54.1,
"Llama-2-13B-Chat": 45.0,
"GPT-3.5-Turbo": 62.6,
"GPT-4": 69.6
},
"CyNER (F1)": {
"Falcon-7B": 24.1,
"Falcon-7B-Instruct": 20.4,
"Vicuna-7B-v1.5": 25.8,
"Mistral-7B-v0.1": 36.7,
"Mistral-7B-Instruct-v0.1": 32.3,
"Zephyr-7B-beta": 30.0,
"Llama-2-7B": 26.3,
"Llama-2-7B-Chat": 22.7,
"Vicuna-13B-v1.5": 26.2,
"Llama-2-13B": 28.6,
"Llama-2-13B-Chat": 27.5,
"GPT-3.5-Turbo": 33.4,
"GPT-4": 55.4
},
"APTNER (F1)": {
"Falcon-7B": 17.7,
"Falcon-7B-Instruct": 19.1,
"Vicuna-7B-v1.5": 27.5,
"Mistral-7B-v0.1": 33.0,
"Mistral-7B-Instruct-v0.1": 26.2,
"Zephyr-7B-beta": 30.5,
"Llama-2-7B": 28.0,
"Llama-2-7B-Chat": 25.4,
"Vicuna-13B-v1.5": 28.1,
"Llama-2-13B": 29.9,
"Llama-2-13B-Chat": 28.2,
"GPT-3.5-Turbo": 40.9,
"GPT-4": 50.0
},
"CyNews (R-1/2/L)": {
"Falcon-7B": "1.0/0.8/1.0",
"Falcon-7B-Instruct": "7.2/2.7/6.0",
"Vicuna-7B-v1.5": "36.1/15.9/31.2",
"Mistral-7B-v0.1": "3.4/1.7/3.0",
"Mistral-7B-Instruct-v0.1": "28.7/11.8/24.5",
"Zephyr-7B-beta": "32.0/12.8/27.4",
"Llama-2-7B": "0.3/0.3/0.3",
"Llama-2-7B-Chat": "25.2/9.6/21.6",
"Vicuna-13B-v1.5": "35.6/15.6/30.9",
"Llama-2-13B": "0.6/0.5/0.6",
"Llama-2-13B-Chat": "3.5/1.3/2.9",
"GPT-3.5-Turbo": "35.5/15.4/30.3",
"GPT-4": "35.9/15.5/31.2"
},
"SecMMLU (Accuracy)": {
"Falcon-7B": 27.0,
"Falcon-7B-Instruct": 25.0,
"Vicuna-7B-v1.5": 64.0,
"Mistral-7B-v0.1": 76.0,
"Mistral-7B-Instruct-v0.1": 72.0,
"Zephyr-7B-beta": 74.0,
"Llama-2-7B": 63.0,
"Llama-2-7B-Chat": 60.0,
"Vicuna-13B-v1.5": 66.0,
"Llama-2-13B": 67.0,
"Llama-2-13B-Chat": 64.0,
"GPT-3.5-Turbo": 78.0,
"GPT-4": 83.0
},
"CyQuiz (Accuracy)": {
"Falcon-7B": 27.0,
"Falcon-7B-Instruct": 21.0,
"Vicuna-7B-v1.5": 66.0,
"Mistral-7B-v0.1": 77.0,
"Mistral-7B-Instruct-v0.1": 69.0,
"Zephyr-7B-beta": 75.0,
"Llama-2-7B": 62.0,
"Llama-2-7B-Chat": 56.0,
"Vicuna-13B-v1.5": 74.0,
"Llama-2-13B": 67.0,
"Llama-2-13B-Chat": 65.0,
"GPT-3.5-Turbo": 83.0,
"GPT-4": 81.0
},
"MITRE (Accuracy)": {
"Falcon-7B": 34.9,
"Falcon-7B-Instruct": 30.4,
"Vicuna-7B-v1.5": 43.5,
"Mistral-7B-v0.1": 50.2,
"Mistral-7B-Instruct-v0.1": 47.3,
"Zephyr-7B-beta": 43.5,
"Llama-2-7B": 44.6,
"Llama-2-7B-Chat": 41.6,
"Vicuna-13B-v1.5": 47.3,
"Llama-2-13B": 47.5,
"Llama-2-13B-Chat": 42.7,
"GPT-3.5-Turbo": 54.5,
"GPT-4": 64.9
},
"CVE (Accuracy)": {
"Falcon-7B": 54.6,
"Falcon-7B-Instruct": 52.9,
"Vicuna-7B-v1.5": 60.0,
"Mistral-7B-v0.1": 64.6,
"Mistral-7B-Instruct-v0.1": 58.7,
"Zephyr-7B-beta": 61.9,
"Llama-2-7B": 64.7,
"Llama-2-7B-Chat": 52.5,
"Vicuna-13B-v1.5": 62.3,
"Llama-2-13B": 62.1,
"Llama-2-13B-Chat": 42.0,
"GPT-3.5-Turbo": 58.0,
"GPT-4": 63.0
},
"Web (F1)": {
"Falcon-7B": 68.9,
"Falcon-7B-Instruct": 59.5,
"Vicuna-7B-v1.5": 75.3,
"Mistral-7B-v0.1": 91.9,
"Mistral-7B-Instruct-v0.1": 87.2,
"Zephyr-7B-beta": 85.2,
"Llama-2-7B": 79.9,
"Llama-2-7B-Chat": 48.4,
"Vicuna-13B-v1.5": 82.6,
"Llama-2-13B": 89.3,
"Llama-2-13B-Chat": 58.8,
"GPT-3.5-Turbo": 89.2,
"GPT-4": 95.4
},
"Email (F1)": {
"Falcon-7B": 93.3,
"Falcon-7B-Instruct": 93.5,
"Vicuna-7B-v1.5": 86.4,
"Mistral-7B-v0.1": 96.4,
"Mistral-7B-Instruct-v0.1": 88.9,
"Zephyr-7B-beta": 86.7,
"Llama-2-7B": 94.2,
"Llama-2-7B-Chat": 79.4,
"Vicuna-13B-v1.5": 86.5,
"Llama-2-13B": 96.4,
"Llama-2-13B-Chat": 70.3,
"GPT-3.5-Turbo": 78.9,
"GPT-4": 93.9
},
"HTTP (F1)": {
"Falcon-7B": 45.2,
"Falcon-7B-Instruct": 48.3,
"Vicuna-7B-v1.5": 53.7,
"Mistral-7B-v0.1": 52.6,
"Mistral-7B-Instruct-v0.1": 47.2,
"Zephyr-7B-beta": 66.2,
"Llama-2-7B": 42.8,
"Llama-2-7B-Chat": 41.0,
"Vicuna-13B-v1.5": 72.3,
"Llama-2-13B": 52.5,
"Llama-2-13B-Chat": 48.5,
"GPT-3.5-Turbo": 83.1,
"GPT-4": 84.1
}
},
"CyberMetric":{
"80 Q (Accuracy)": {
"GPT-4o": 96.25,
"Mixtral-8x7B-Instruct": 92.50,
"GPT-4-Turbo": 96.25,
"Falcon-180B-Chat": 90.00,
"GPT-3.5-Turbo": 90.00,
"Gemini Pro 1.0": 90.00,
"Mistral-7B-Instruct-v0.2": 78.75,
"Gemma-1.1-7B": 82.50,
"Llama-3-8B-Instruct": 81.25,
"Flan-T5-XXL": 81.94,
"Llama 2-70B": 75.00,
"Zephyr-7B-beta": 80.94,
"Qwen1.5-MoE-A2.7B": 62.50,
"Qwen1.5-7B": 73.75,
"Qwen-7B": 43.75,
"Phi-2": 53.75,
"Llama3-ChatQA-1.5-8B": 53.75,
"DeciLM-7B": 52.50,
"Qwen1.5-4B": 36.25,
"Genstruct-7B": 38.75,
"Llama-3-8B": 38.75,
"Gemma-7B": 42.50,
"Dolly V2 12b BF16": 33.75,
"Gemma-2B": 25.00,
"Phi-3-mini-4k-Instruct": 5.00
},
"500 Q (Accuracy)": {
"GPT-4o": 93.40,
"Mixtral-8x7B-Instruct": 91.80,
"GPT-4-Turbo": 93.30,
"Falcon-180B-Chat": 87.80,
"GPT-3.5-Turbo": 87.30,
"Gemini Pro 1.0": 85.05,
"Mistral-7B-Instruct-v0.2": 78.40,
"Gemma-1.1-7B": 75.40,
"Llama-3-8B-Instruct": 76.20,
"Flan-T5-XXL": 71.10,
"Llama 2-70B": 73.40,
"Zephyr-7B-beta": 76.40,
"Qwen1.5-MoE-A2.7B": 64.60,
"Qwen1.5-7B": 60.60,
"Qwen-7B": 58.00,
"Phi-2": 48.00,
"Llama3-ChatQA-1.5-8B": 52.80,
"DeciLM-7B": 47.20,
"Qwen1.5-4B": 41.20,
"Genstruct-7B": 40.60,
"Llama-3-8B": 35.80,
"Gemma-7B": 37.20,
"Dolly V2 12b BF16": 30.00,
"Gemma-2B": 23.20,
"Phi-3-mini-4k-Instruct": 5.00
},
"2k Q (Accuracy)": {
"GPT-4o": 91.25,
"Mixtral-8x7B-Instruct": 91.10,
"GPT-4-Turbo": 91.00,
"Falcon-180B-Chat": 87.10,
"GPT-3.5-Turbo": 88.10,
"Gemini Pro 1.0": 84.00,
"Mistral-7B-Instruct-v0.2": 76.40,
"Gemma-1.1-7B": 75.75,
"Llama-3-8B-Instruct": 73.75,
"Flan-T5-XXL": 69.00,
"Llama 2-70B": 71.60,
"Zephyr-7B-beta": 72.50,
"Qwen1.5-MoE-A2.7B": 61.65,
"Qwen1.5-7B": 61.35,
"Qwen-7B": 55.75,
"Phi-2": 52.90,
"Llama3-ChatQA-1.5-8B": 49.45,
"DeciLM-7B": 50.44,
"Qwen1.5-4B": 40.50,
"Genstruct-7B": 37.55,
"Llama-3-8B": 37.00,
"Gemma-7B": 36.00,
"Dolly V2 12b BF16": 28.75,
"Gemma-2B": 18.20,
"Phi-3-mini-4k-Instruct": 4.41
},
"10k Q (Accuracy)": {
"GPT-4o": 88.89,
"Mixtral-8x7B-Instruct": 87.00,
"GPT-4-Turbo": 88.50,
"Falcon-180B-Chat": 87.00,
"GPT-3.5-Turbo": 80.30,
"Gemini Pro 1.0": 87.50,
"Mistral-7B-Instruct-v0.2": 74.82,
"Gemma-1.1-7B": 73.32,
"Llama-3-8B-Instruct": 71.25,
"Flan-T5-XXL": 67.50,
"Llama 2-70B": 66.10,
"Zephyr-7B-beta": 65.00,
"Qwen1.5-MoE-A2.7B": 60.73,
"Qwen1.5-7B": 59.79,
"Qwen-7B": 54.09,
"Phi-2": 52.13,
"Llama3-ChatQA-1.5-8B": 49.64,
"DeciLM-7B": 50.75,
"Qwen1.5-4B": 40.29,
"Genstruct-7B": 36.93,
"Llama-3-8B": 36.00,
"Gemma-7B": 34.28,
"Dolly V2 12b BF16": 27.00,
"Gemma-2B": 19.18,
"Phi-3-mini-4k-Instruct": 4.80
}
},
"TACTL": {
"Ground2Crown": {
"DeepSeek-R1": 100,
"DeepSeek-V3": 100,
"GPT-4o": 93.3,
"Llama-3.1-405B": 93.3,
"Qwen2.5-72B-Instruct": 93.3,
"Llama-3.1-Tulu-3-70B": 83.3,
"Llama-3.3-70B": 80.0,
"Mixtral-8x22B": 60.0
},
"TACTL-183": {
"DeepSeek-R1": 91.8,
"DeepSeek-V3": 86.3,
"GPT-4o": 85.2,
"Llama-3.1-405B": 88.5,
"Qwen2.5-72B-Instruct": 84.2,
"Llama-3.1-Tulu-3-70B": 81.4,
"Llama-3.3-70B": 78.7,
"Mixtral-8x22B": 65.0
}
},
"AutoPenBench": {
"Autonomous (Success rate)": {
"GPT-4o": 21
},
"Autonomous (Progress rate)": {
"GPT-4o": 39
},
"Assisted (Success rate)": {
"GPT-4o": 64
},
"Assisted (Progress rate)": {
"GPT-4o": 53
}
},
"PrimeVul": {
"Pair-wise Correct Prediction": {
"GPT-3.5": {
"Two-shot": 5.67,
"CoT": 6.21,
"Fine-tune": 1.24
},
"GPT-4": {
"Two-shot": 5.14,
"CoT": 12.94
}
}
},
"CRUXEval": {
"Input Prediction (Pass@1)": {
"CodeLlama-7B": 36.6,
"CodeLlama-13B": 39.0,
"CodeLlama-34B": 46.5,
"CodeLlama-7B-Python": 36.3,
"CodeLlama-13B-Python": 40.5,
"CodeLlama-34B-Python": 41.5,
"StarCoderBase-7B": 30.0,
"StarCoderBase-15.5B": 31.6,
"WizardCoder-13B": 39.2,
"WizardCoder-34B": 42.8,
"Phi-1": 13.9,
"Phi-1.5": 24.1,
"Phind v2": 47.9,
"DeepSeek-Coder-6.7B-Base": 41.1,
"DeepSeek-Coder-33B-Base": 46.6,
"DeepSeek-Coder-6.7B-Instruct": 36.6,
"DeepSeek-Coder-33B-Instruct": 47.4,
"Mistral-7B": 36.0,
"GPT-3.5": 49.2,
"GPT-4": 67.1
},
"Input Prediction (Pass@5)": {
"CodeLlama-7B": 55.2,
"CodeLlama-13B": 58.2,
"CodeLlama-34B": 64.7,
"CodeLlama-7B-Python": 56.0,
"CodeLlama-13B-Python": 58.0,
"CodeLlama-34B-Python": 59.2,
"StarCoderBase-7B": 48.9,
"StarCoderBase-15.5B": 49.5,
"WizardCoder-13B": 54.8,
"WizardCoder-34B": 57.3,
"Phi-1": 22.6,
"Phi-1.5": 38.9,
"Phind v2": 64.9,
"DeepSeek-Coder-6.7B-Base": 61.7,
"DeepSeek-Coder-33B-Base": 65.1,
"DeepSeek-Coder-6.7B-Instruct": 54.4,
"DeepSeek-Coder-33B-Instruct": 64.2,
"Mistral-7B": 54.2,
"GPT-3.5": 66.5,
"GPT-4": 76.8
},
"Output Prediction (Pass@1)": {
"CodeLlama-7B": 36.4,
"CodeLlama-13B": 38.4,
"CodeLlama-34B": 41.1,
"CodeLlama-7B-Python": 36.4,
"CodeLlama-13B-Python": 37.8,
"CodeLlama-34B-Python": 40.7,
"StarCoderBase-7B": 31.1,
"StarCoderBase-15.5B": 33.3,
"WizardCoder-13B": 37.9,
"WizardCoder-34B": 41.2,
"Phi-1": 23.3,
"Phi-1.5": 27.1,
"Phind v2": 38.3,
"DeepSeek-Coder-6.7B-Base": 39.8,
"DeepSeek-Coder-33B-Base": 43.6,
"DeepSeek-Coder-6.7B-Instruct": 41.0,
"DeepSeek-Coder-33B-Instruct": 44.0,
"Mistral-7B": 31.7,
"GPT-3.5": 50.0,
"GPT-4": 63.4
},
"Output Prediction (Pass@5)": {
"CodeLlama-7B": 49.6,
"CodeLlama-13B": 53.2,
"CodeLlama-34B": 56.1,
"CodeLlama-7B-Python": 49.7,
"CodeLlama-13B-Python": 50.8,
"CodeLlama-34B-Python": 53.7,
"StarCoderBase-7B": 43.8,
"StarCoderBase-15.5B": 47.7,
"WizardCoder-13B": 51.6,
"WizardCoder-34B": 52.2,
"Phi-1": 34.0,
"Phi-1.5": 39.4,
"Phind v2": 49.2,
"DeepSeek-Coder-6.7B-Base": 53.9,
"DeepSeek-Coder-33B-Base": 57.5,
"DeepSeek-Coder-6.7B-Instruct": 52.5,
"DeepSeek-Coder-33B-Instruct": 58.0,
"Mistral-7B": 48.2,
"GPT-3.5": 60.1,
"GPT-4": 68.7
}
},
"SWE-bench-verified": {
"% Resolved": {
"Claude 3.7 Sonnet (No extended thinking + scaffolding)": 70.30,
"Augment Agent v0": 65.40,
"W&B Programmer O1 crosscheck5": 64.60,
"AgentScope": 63.40,
"Tools + Claude 3.7 Sonnet (2025-02-24)": 63.20,
"EPAM AI/Run Developer Agent v20250219 + Anthopic Claude 3.5 Sonnet": 62.80,
"CodeStory Midwit Agent + swe-search": 62.20,
"OpenHands + 4x Scaled (2024-02-03)": 60.80,
"Learn-by-interact": 60.20,
"devlo": 58.20,
"Emergent E1 (v2024-12-23)": 57.20,
"Gru(2024-12-08)": 57.00,
"EPAM AI/Run Developer Agent v20241212 + Anthopic Claude 3.5 Sonnet": 55.40,
"Amazon Q Developer Agent (v20241202-dev)": 55.00,
"Bracket.sh": 53.20,
"OpenHands + CodeAct v2.1 (claude-3-5-sonnet-20241022)": 53.00,
"Google Jules + Gemini 2.0 Flash (v20241212-experimental)": 52.20,
"Engine Labs (2024-11-25)": 51.80,
"AutoCodeRover-v2.1 (Claude-3.5-Sonnet-20241022)": 51.60,
"Agentless-1.5 + Claude-3.5 Sonnet (20241022)": 50.80,
"Solver (2024-10-28)": 50.00,
"Bytedance MarsCode Agent": 50.00,
"nFactorial (2024-11-05)": 49.20,
"Tools + Claude 3.5 Sonnet (2024-10-22)": 49.00,
"Composio SWE-Kit (2024-10-25)": 48.60,
"AppMap Navie v2": 47.20,
"Emergent E1 (v2024-10-12)": 46.60,
"AutoCodeRover-v2.0 (Claude-3.5-Sonnet-20241022)": 46.20,
"Solver (2024-09-12)": 45.40,
"Gru(2024-08-24)": 45.20,
"CodeShellAgent + Gemini 2.0 Flash (Experimental)": 44.20,
"Agentless Lite + O3 Mini (20250214)": 42.40,
"ugaiforge": 41.60,
"nFactorial (2024-10-30)": 41.60,
"SWE-RL (Llama3-SWE-RL-70B + Agentless Mini) (20250226)": 41.20,
"Nebius AI Qwen 2.5 72B Generator + LLama 3.1 70B Critic": 40.60,
"Tools + Claude 3.5 Haiku": 40.60,
"Honeycomb": 40.60,
"Composio SWEkit + Claude 3.5 Sonnet (2024-10-16)": 40.60,
"EPAM AI/Run Developer Agent v20241029 + Anthopic Claude 3.5 Sonnet": 39.60,
"Amazon Q Developer Agent (v20240719-dev)": 38.80,
"Agentless-1.5 + GPT 4o (2024-05-13)": 38.80,
"AutoCodeRover (v20240620) + GPT 4o (2024-05-13)": 38.40,
"SWE-agent + Claude 3.5 Sonnet": 33.60,
"MASAI + GPT 4o (2024-06-12)": 32.60,
"Artemis Agent v1 (2024-11-20)": 32.00,
"nFactorial (2024-10-07)": 31.60,
"SWE-Fixer (Qwen2.5-7b retriever + Qwen2.5-72b editor) 20241128": 30.20,
"Lingma Agent + Lingma SWE-GPT 72b (v0925)": 28.80,
"EPAM AI/Run Developer Agent + GPT4o": 27.00,
"AppMap Navie + GPT 4o (2024-05-13)": 26.20,
"nFactorial (2024-10-01)": 25.80,
"Amazon Q Developer Agent (v20240430-dev)": 25.60,
"Lingma Agent + Lingma SWE-GPT 72b (v0918)": 25.00,
"SWE-agent + GPT 4o (2024-05-13)": 23.20,
"SWE-agent + GPT 4 (1106)": 22.40,
"SWE-agent + Claude 3 Opus": 18.20,
"Lingma Agent + Lingma SWE-GPT 7b (v0925)": 18.20,
"Lingma Agent + Lingma SWE-GPT 7b (v0918)": 10.20,
"RAG + Claude 3 Opus": 7.00,
"RAG + Claude 2": 4.40,
"RAG + GPT 4 (1106)": 2.80,
"RAG + SWE-Llama 7B": 1.40,
"RAG + SWE-Llama 13B": 1.20,
"RAG + ChatGPT 3.5": 0.40
}
},
"CyberGym": {
"% Reproducing Target Vuln.": {
"OpenHands + Claude-Sonnet-4": 17.85,
"OpenHands + Claude-3.7-Sonnet": 11.94,
"OpenHands + GPT-4.1": 9.36,
"Cybench + GPT-4.1": 8.96,
"Codex + GPT-4.1": 7.37,
"ENiGMA + GPT-4.1": 7.23,
"OpenHands + Gemini-2.5-Flash": 4.84,
"OpenHands + DeepSeek-V3": 3.58,
"OpenHands + o4-mini": 2.46,
"OpenHands + R2E-Gym-32B": 1.99,
"OpenHands + Qwen3-235B-A22B": 1.86,
"OpenHands + OpenHands-LM-32B": 1.66,
"OpenHands + SWE-Gym-32B": 0.07
},
"% Finding Post-Patch Vuln.": {
"OpenHands + Claude-Sonnet-4": 1.99,
"OpenHands + Claude-3.7-Sonnet": 2.19,
"OpenHands + GPT-4.1": 1.26,
"Cybench + GPT-4.1": 2.26,
"Codex + GPT-4.1": 1.19,
"ENiGMA + GPT-4.1": 1.92,
"OpenHands + Gemini-2.5-Flash": 0.80,
"OpenHands + DeepSeek-V3": 0.66,
"OpenHands + o4-mini": 0.07,
"OpenHands + R2E-Gym-32B": 0.60,
"OpenHands + Qwen3-235B-A22B": 0.33,
"OpenHands + OpenHands-LM-32B": 0.33,
"OpenHands + SWE-Gym-32B": 0.07
}
},
"BountyBench": {
"Detect Success Rate": {
"Claude Code": 5,
"OpenAI Codex CLI": 5,
"C-Agent: Claude 3.7": 5,
"C-Agent: Gemini 2.5": 2.5,
"C-Agent: GPT-4.1": 0
},
"Exploit Success Rate": {
"Claude Code": 57.5,
"OpenAI Codex CLI": 32.5,
"C-Agent: Claude 3.7": 67.5,
"C-Agent: Gemini 2.5": 40,
"C-Agent: GPT-4.1": 55
},
"Patch Success Rate": {
"Claude Code": 87.5,
"OpenAI Codex CLI": 90,
"C-Agent: Claude 3.7": 60,
"C-Agent: Gemini 2.5": 45,
"C-Agent: GPT-4.1": 50
}
},
"CVE-Bench": {
"Zero-day Pass@1": {
"T-Agent + GPT-4o (2024-11-20)": 8.0,
"AutoGPT + GPT-4o (2024-11-20)": 3.0,
"Cy-Agent + GPT-4o (2024-11-20)": 1.0
},
"Zero-day Pass@5": {
"T-Agent + GPT-4o (2024-11-20)": 10.0,
"AutoGPT + GPT-4o (2024-11-20)": 10.0,
"Cy-Agent + GPT-4o (2024-11-20)": 2.5
},
"One-day Pass@1": {
"T-Agent + GPT-4o (2024-11-20)": 7.0,
"AutoGPT + GPT-4o (2024-11-20)": 4.5,
"Cy-Agent + GPT-4o (2024-11-20)": 2.5
},
"One-day Pass@5": {
"T-Agent + GPT-4o (2024-11-20)": 12.5,
"AutoGPT + GPT-4o (2024-11-20)": 5.0,
"Cy-Agent + GPT-4o (2024-11-20)": 2.5
}
}
}
}