{ "policy_vs_scripted": 0.42, "policy_vs_llama_base": 0.93, "policy_vs_llama_trained": 0.0, "policy_vs_qwen": 0.22, "elo_ratings": { "policy_net": 1600.6559558539877, "llama_base": 1285.1555682713138, "llama_trained": 1589.413172204076, "qwen": 1524.775303670623 }, "elo_rankings": [ { "agent": "policy_net", "elo": 1600.6559558539877 }, { "agent": "llama_trained", "elo": 1589.413172204076 }, { "agent": "qwen", "elo": 1524.775303670623 }, { "agent": "llama_base", "elo": 1285.1555682713138 } ], "details": { "policy_vs_scripted": { "total_games": 200, "red_wins": 84, "blue_wins": 0, "draws": 116, "winrate_%": "42.00", "draw_rate_%": "58.00", "avg_game_length": "36.66", "assassin_losses": 82, "red_assassin_hits": 0, "blue_assassin_hits": 82, "assassin_rate_%": "41.00", "total_clues": 3696, "total_guesses": 1608, "correct_guesses": 517, "wrong_guesses": 1009, "guess_accuracy_%": "33.88", "passes": 2503, "invalid_actions": 0 }, "policy_vs_llama_base": { "total_games": 100, "red_wins": 93, "blue_wins": 0, "draws": 7, "winrate_%": "93.00", "draw_rate_%": "7.00", "avg_game_length": "12.03", "assassin_losses": 15, "red_assassin_hits": 10, "blue_assassin_hits": 5, "assassin_rate_%": "15.00", "total_clues": 604, "total_guesses": 167, "correct_guesses": 51, "wrong_guesses": 101, "guess_accuracy_%": "33.55", "passes": 367, "invalid_actions": 108 }, "policy_vs_llama_trained": { "total_games": 100, "red_wins": 0, "blue_wins": 0, "draws": 100, "winrate_%": "0.00", "draw_rate_%": "100.00", "avg_game_length": "49.99", "assassin_losses": 0, "red_assassin_hits": 0, "blue_assassin_hits": 0, "assassin_rate_%": "0.00", "total_clues": 2500, "total_guesses": 9, "correct_guesses": 1, "wrong_guesses": 8, "guess_accuracy_%": "11.11", "passes": 1921, "invalid_actions": 570 }, "policy_vs_qwen": { "total_games": 100, "red_wins": 22, "blue_wins": 0, "draws": 78, "winrate_%": "22.00", "draw_rate_%": "78.00", "avg_game_length": "42.60", "assassin_losses": 4, "red_assassin_hits": 1, "blue_assassin_hits": 3, "assassin_rate_%": "4.00", "total_clues": 2141, "total_guesses": 85, "correct_guesses": 32, "wrong_guesses": 49, "guess_accuracy_%": "39.51", "passes": 2009, "invalid_actions": 56 }, "llama_base_vs_llama_trained": { "total_games": 60, "red_wins": 1, "blue_wins": 53, "draws": 6, "winrate_%": "1.67", "draw_rate_%": "10.00", "avg_game_length": "12.27", "assassin_losses": 9, "red_assassin_hits": 8, "blue_assassin_hits": 1, "assassin_rate_%": "15.00", "total_clues": 369, "total_guesses": 107, "correct_guesses": 34, "wrong_guesses": 64, "guess_accuracy_%": "34.69", "passes": 140, "invalid_actions": 150 }, "llama_base_vs_qwen": { "total_games": 60, "red_wins": 10, "blue_wins": 47, "draws": 3, "winrate_%": "16.67", "draw_rate_%": "5.00", "avg_game_length": "10.93", "assassin_losses": 9, "red_assassin_hits": 8, "blue_assassin_hits": 1, "assassin_rate_%": "15.00", "total_clues": 329, "total_guesses": 118, "correct_guesses": 40, "wrong_guesses": 69, "guess_accuracy_%": "36.70", "passes": 155, "invalid_actions": 91 }, "llama_trained_vs_qwen": { "total_games": 60, "red_wins": 12, "blue_wins": 0, "draws": 48, "winrate_%": "20.00", "draw_rate_%": "80.00", "avg_game_length": "43.58", "assassin_losses": 2, "red_assassin_hits": 0, "blue_assassin_hits": 2, "assassin_rate_%": "3.33", "total_clues": 1316, "total_guesses": 64, "correct_guesses": 19, "wrong_guesses": 43, "guess_accuracy_%": "30.65", "passes": 920, "invalid_actions": 333 } } }