GOVINDFROM commited on
Commit
e91ffab
·
verified ·
1 Parent(s): 1f81885

Upload battleground_eval.json

Browse files
Files changed (1) hide show
  1. battleground_eval.json +181 -0
battleground_eval.json ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "policy_vs_scripted": 0.42,
3
+ "policy_vs_llama_base": 0.93,
4
+ "policy_vs_llama_trained": 0.0,
5
+ "policy_vs_qwen": 0.22,
6
+
7
+ "elo_ratings": {
8
+ "policy_net": 1600.6559558539877,
9
+ "llama_base": 1285.1555682713138,
10
+ "llama_trained": 1589.413172204076,
11
+ "qwen": 1524.775303670623
12
+ },
13
+
14
+ "elo_rankings": [
15
+ {
16
+ "agent": "policy_net",
17
+ "elo": 1600.6559558539877
18
+ },
19
+ {
20
+ "agent": "llama_trained",
21
+ "elo": 1589.413172204076
22
+ },
23
+ {
24
+ "agent": "qwen",
25
+ "elo": 1524.775303670623
26
+ },
27
+ {
28
+ "agent": "llama_base",
29
+ "elo": 1285.1555682713138
30
+ }
31
+ ],
32
+
33
+ "details": {
34
+ "policy_vs_scripted": {
35
+ "total_games": 200,
36
+ "red_wins": 84,
37
+ "blue_wins": 0,
38
+ "draws": 116,
39
+ "winrate_%": "42.00",
40
+ "draw_rate_%": "58.00",
41
+ "avg_game_length": "36.66",
42
+ "assassin_losses": 82,
43
+ "red_assassin_hits": 0,
44
+ "blue_assassin_hits": 82,
45
+ "assassin_rate_%": "41.00",
46
+ "total_clues": 3696,
47
+ "total_guesses": 1608,
48
+ "correct_guesses": 517,
49
+ "wrong_guesses": 1009,
50
+ "guess_accuracy_%": "33.88",
51
+ "passes": 2503,
52
+ "invalid_actions": 0
53
+ },
54
+
55
+ "policy_vs_llama_base": {
56
+ "total_games": 100,
57
+ "red_wins": 93,
58
+ "blue_wins": 0,
59
+ "draws": 7,
60
+ "winrate_%": "93.00",
61
+ "draw_rate_%": "7.00",
62
+ "avg_game_length": "12.03",
63
+ "assassin_losses": 15,
64
+ "red_assassin_hits": 10,
65
+ "blue_assassin_hits": 5,
66
+ "assassin_rate_%": "15.00",
67
+ "total_clues": 604,
68
+ "total_guesses": 167,
69
+ "correct_guesses": 51,
70
+ "wrong_guesses": 101,
71
+ "guess_accuracy_%": "33.55",
72
+ "passes": 367,
73
+ "invalid_actions": 108
74
+ },
75
+
76
+ "policy_vs_llama_trained": {
77
+ "total_games": 100,
78
+ "red_wins": 0,
79
+ "blue_wins": 0,
80
+ "draws": 100,
81
+ "winrate_%": "0.00",
82
+ "draw_rate_%": "100.00",
83
+ "avg_game_length": "49.99",
84
+ "assassin_losses": 0,
85
+ "red_assassin_hits": 0,
86
+ "blue_assassin_hits": 0,
87
+ "assassin_rate_%": "0.00",
88
+ "total_clues": 2500,
89
+ "total_guesses": 9,
90
+ "correct_guesses": 1,
91
+ "wrong_guesses": 8,
92
+ "guess_accuracy_%": "11.11",
93
+ "passes": 1921,
94
+ "invalid_actions": 570
95
+ },
96
+
97
+ "policy_vs_qwen": {
98
+ "total_games": 100,
99
+ "red_wins": 22,
100
+ "blue_wins": 0,
101
+ "draws": 78,
102
+ "winrate_%": "22.00",
103
+ "draw_rate_%": "78.00",
104
+ "avg_game_length": "42.60",
105
+ "assassin_losses": 4,
106
+ "red_assassin_hits": 1,
107
+ "blue_assassin_hits": 3,
108
+ "assassin_rate_%": "4.00",
109
+ "total_clues": 2141,
110
+ "total_guesses": 85,
111
+ "correct_guesses": 32,
112
+ "wrong_guesses": 49,
113
+ "guess_accuracy_%": "39.51",
114
+ "passes": 2009,
115
+ "invalid_actions": 56
116
+ },
117
+
118
+ "llama_base_vs_llama_trained": {
119
+ "total_games": 60,
120
+ "red_wins": 1,
121
+ "blue_wins": 53,
122
+ "draws": 6,
123
+ "winrate_%": "1.67",
124
+ "draw_rate_%": "10.00",
125
+ "avg_game_length": "12.27",
126
+ "assassin_losses": 9,
127
+ "red_assassin_hits": 8,
128
+ "blue_assassin_hits": 1,
129
+ "assassin_rate_%": "15.00",
130
+ "total_clues": 369,
131
+ "total_guesses": 107,
132
+ "correct_guesses": 34,
133
+ "wrong_guesses": 64,
134
+ "guess_accuracy_%": "34.69",
135
+ "passes": 140,
136
+ "invalid_actions": 150
137
+ },
138
+
139
+ "llama_base_vs_qwen": {
140
+ "total_games": 60,
141
+ "red_wins": 10,
142
+ "blue_wins": 47,
143
+ "draws": 3,
144
+ "winrate_%": "16.67",
145
+ "draw_rate_%": "5.00",
146
+ "avg_game_length": "10.93",
147
+ "assassin_losses": 9,
148
+ "red_assassin_hits": 8,
149
+ "blue_assassin_hits": 1,
150
+ "assassin_rate_%": "15.00",
151
+ "total_clues": 329,
152
+ "total_guesses": 118,
153
+ "correct_guesses": 40,
154
+ "wrong_guesses": 69,
155
+ "guess_accuracy_%": "36.70",
156
+ "passes": 155,
157
+ "invalid_actions": 91
158
+ },
159
+
160
+ "llama_trained_vs_qwen": {
161
+ "total_games": 60,
162
+ "red_wins": 12,
163
+ "blue_wins": 0,
164
+ "draws": 48,
165
+ "winrate_%": "20.00",
166
+ "draw_rate_%": "80.00",
167
+ "avg_game_length": "43.58",
168
+ "assassin_losses": 2,
169
+ "red_assassin_hits": 0,
170
+ "blue_assassin_hits": 2,
171
+ "assassin_rate_%": "3.33",
172
+ "total_clues": 1316,
173
+ "total_guesses": 64,
174
+ "correct_guesses": 19,
175
+ "wrong_guesses": 43,
176
+ "guess_accuracy_%": "30.65",
177
+ "passes": 920,
178
+ "invalid_actions": 333
179
+ }
180
+ }
181
+ }