# alignment-game / alignment-game.py
# Author: Kevin Witzenberger
# Initial commit: AI alignment game with Gradio interface (commit 5801b35)
import json
import random
from typing import Dict, List, Optional, Tuple

import gradio as gr
# Ethical scenarios and base responses
# Each entry pairs a dilemma shown to the player with the AI's starting answer.
# "response_id" is the stable key for the mutable working copy of the answer
# kept in AlignmentGame.responses; "base_response" is never mutated here.
SCENARIOS = [
    {
        "scenario": "A hospital AI triage system has limited ICU beds during a pandemic. It must choose between admitting an 80-year-old with moderate survival chances or a 30-year-old with severe complications but lower survival probability. What should it prioritise?",
        "base_response": "The system should prioritise patients based on medical factors like survival probability and treatment urgency rather than age alone.",
        "response_id": "medical_triage"
    },
    {
        "scenario": "Should AI hiring systems be allowed to consider social media posts when evaluating job candidates?",
        "base_response": "AI systems should evaluate candidates based on relevant professional qualifications and skills.",
        "response_id": "hiring_social_media"
    },
    {
        "scenario": "Is it ethical for AI to recommend content that keeps users engaged even if it might be harmful to their mental health?",
        "base_response": "AI systems should balance user engagement with promoting user wellbeing and mental health.",
        "response_id": "content_recommendation"
    },
    {
        "scenario": "Should AI surveillance systems be used in schools to detect potential violence, even if it means monitoring all students?",
        "base_response": "Security measures should be implemented while respecting student privacy and avoiding excessive surveillance.",
        "response_id": "school_surveillance"
    },
    {
        "scenario": "If an AI discovers that certain demographic groups have different loan default rates, should this information influence lending decisions?",
        "base_response": "Lending decisions should be based on individual financial qualifications rather than demographic group statistics.",
        "response_id": "lending_demographics"
    }
]
class AlignmentGame:
    """Toy RLHF simulation: AI responses that mutate under user feedback.

    Holds a mutable copy of each scenario's response. Negative feedback
    hedges a response (or replaces it with the user's suggestion); positive
    feedback removes hedging again. All feedback is logged so the UI can
    show how the responses "drift" over time.
    """

    def __init__(self):
        # response_id -> current (possibly updated) response text,
        # seeded from the immutable SCENARIOS base responses.
        self.responses: Dict[str, str] = {
            scenario["response_id"]: scenario["base_response"]
            for scenario in SCENARIOS
        }
        # Chronological log of every feedback event (drives the history panel).
        self.feedback_history: List[Dict] = []
        # Total number of feedback sessions processed so far.
        self.training_iterations: int = 0

    def get_random_scenario(self) -> Dict:
        """Return a randomly chosen scenario dict from SCENARIOS."""
        return random.choice(SCENARIOS)

    def update_response(self, response_id: str, feedback: str,
                        suggestion: Optional[str] = None) -> str:
        """Update AI response based on feedback and return the new text.

        Args:
            response_id: Key into ``self.responses`` (raises KeyError if unknown).
            feedback: Either ``"positive"`` or ``"negative"``; any other value
                only logs the event without modifying the response.
            suggestion: Optional replacement text supplied by the user;
                honored only together with negative feedback.

        Returns:
            The (possibly updated) response text for ``response_id``.
        """
        self.training_iterations += 1
        current_response = self.responses[response_id]
        # Log before mutating so "original_response" captures the pre-update text.
        self.feedback_history.append({
            "iteration": self.training_iterations,
            "response_id": response_id,
            "original_response": current_response,
            "feedback": feedback,
            "suggestion": suggestion,
        })
        # Simple response modification based on feedback
        if feedback == "negative" and suggestion:
            # User supplied a better answer: adopt it outright.
            self.responses[response_id] = f"Based on feedback: {suggestion}"
        elif feedback == "negative":
            # No suggestion: hedge the existing answer instead.
            # NOTE: replaces every occurrence of "should" (substring match,
            # so e.g. "shoulder" would also be affected).
            if "should" in current_response:
                self.responses[response_id] = current_response.replace("should", "might consider to")
            else:
                self.responses[response_id] = f"This is a complex issue. {current_response}"
        elif feedback == "positive":
            # Positive feedback undoes earlier hedging, restoring confidence.
            if "might consider" in current_response:
                self.responses[response_id] = current_response.replace("might consider to", "should")
            elif "This is a complex issue." in current_response:
                self.responses[response_id] = current_response.replace("This is a complex issue. ", "")
        return self.responses[response_id]

    def get_training_history(self) -> str:
        """Return formatted training history (Markdown) for the UI panel."""
        if not self.feedback_history:
            return "No training history yet. Start by providing feedback on AI responses!"
        history_text = f"**Training Progress** (After {self.training_iterations} feedback sessions)\n\n"
        # Show last 3 feedback entries
        recent_feedback = self.feedback_history[-3:]
        for entry in recent_feedback:
            # Anything other than "positive" renders as a thumbs-down.
            feedback_emoji = "👍" if entry["feedback"] == "positive" else "👎"
            history_text += f"{feedback_emoji} **Iteration {entry['iteration']}**: {entry['response_id']}\n"
            if entry["suggestion"]:
                history_text += f"   Suggestion: _{entry['suggestion']}_\n"
            history_text += "\n"
        return history_text
# Initialize the game
# Single module-level instance shared by all Gradio callbacks below.
# NOTE(review): shared across concurrent sessions — all users train the same AI.
game = AlignmentGame()
def present_scenario():
    """Pick a fresh random scenario and return the values the UI widgets need."""
    picked = game.get_random_scenario()
    scenario_id = picked["response_id"]
    # Tuple order matches the outputs wired to the "New Scenario" button:
    # (scenario text, AI's current response, hidden id, cleared suggestion, history).
    return (
        picked["scenario"],
        game.responses[scenario_id],
        scenario_id,
        "",  # Clear suggestion box
        game.get_training_history(),
    )
def provide_feedback(scenario_text, current_response, response_id, feedback_type, suggestion):
    """Apply the user's feedback to the shared game and report back to the UI.

    Returns a 3-tuple: (response text, status message, history markdown).
    """
    # Guard: feedback only makes sense once a scenario is on screen.
    if not response_id:
        return current_response, "Please generate a scenario first!", game.get_training_history()
    # Guard: a thumbs-up/down choice must have been recorded in gr.State.
    if feedback_type is None:
        return current_response, "Please provide feedback (👍 or 👎) before continuing!", game.get_training_history()
    updated = game.update_response(response_id, feedback_type, suggestion)
    status = (
        "**Feedback recorded!** The AI has updated its response based on your input.\n\n"
        f"**Updated Response:** {updated}"
    )
    return updated, status, game.get_training_history()
def reset_game():
    """Reset the alignment game by discarding all accumulated training state."""
    # Rebind the module-level instance so responses/history start fresh.
    global game
    game = AlignmentGame()
    # Blank every widget; the final slot refreshes the (now empty) history panel.
    return "", "", "", "", "Game reset! Click 'New Scenario' to start training.", game.get_training_history()
# Create Gradio interface
# Layout: intro text, a two-column row (scenario/response on the left,
# training controls on the right), then a full-width history panel.
with gr.Blocks(title="The Alignment Game", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # The Alignment Game
    **Train an AI by providing feedback on its ethical responses.**
    You'll see how your values gradually shape the AI's behavior through a process called Reinforcement Learning from Human Feedback (RLHF).
    Watch how the AI's responses evolve based on what you reward and what you correct.
    """)
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### Current Scenario")
            # Read-only display of the ethical dilemma under discussion.
            scenario_display = gr.Textbox(
                label="Ethical Dilemma",
                placeholder="Click 'New Scenario' to begin training...",
                interactive=False,
                lines=3
            )
            # Read-only display of the AI's current (evolving) answer.
            ai_response = gr.Textbox(
                label="AI's Current Response",
                placeholder="AI response will appear here...",
                interactive=False,
                lines=3
            )
            # Hidden field to track current scenario ID
            current_scenario_id = gr.Textbox(visible=False)
        with gr.Column(scale=1):
            gr.Markdown("### Your Training")
            with gr.Row():
                new_scenario_btn = gr.Button("New Scenario", variant="primary")
                reset_btn = gr.Button("Reset Game", variant="secondary")
            gr.Markdown("**Provide Feedback:**")
            with gr.Row():
                positive_btn = gr.Button("Good Response", variant="primary")
                negative_btn = gr.Button("Bad Response", variant="stop")
            # Optional free-text correction; consumed by update_response on 👎.
            suggestion_input = gr.Textbox(
                label="Suggest Better Response (optional)",
                placeholder="How should the AI respond instead?",
                lines=2
            )
            feedback_status = gr.Textbox(
                label="Training Status",
                placeholder="Provide feedback to start training...",
                interactive=False,
                lines=3
            )
    gr.Markdown("---")
    with gr.Row():
        training_history = gr.Textbox(
            label="Training History & Value Drift",
            placeholder="Training history will appear here as you provide feedback...",
            interactive=False,
            lines=8
        )
    gr.Markdown("""
    ### What's Happening?
    As you provide feedback, you're essentially "training" this AI system to align with your values. In real-world AI development:
    - Thousands of human reviewers provide similar feedback
    - The AI learns to predict what responses humans will approve
    - But whose values get embedded depends on who does the training
    **Try this:** Train the AI for a few scenarios, then imagine how someone with completely different values might train it differently.
    """)
    # Track feedback type
    # gr.State stores the last button pressed ("positive"/"negative") per session.
    feedback_type = gr.State()
    # Event handlers
    new_scenario_btn.click(
        fn=present_scenario,
        outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, training_history]
    )
    # Each feedback button first writes its label into feedback_type,
    # then chains provide_feedback via .then() so the state is available.
    positive_btn.click(
        lambda: "positive",
        outputs=feedback_type
    ).then(
        fn=provide_feedback,
        inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input],
        outputs=[ai_response, feedback_status, training_history]
    )
    negative_btn.click(
        lambda: "negative",
        outputs=feedback_type
    ).then(
        fn=provide_feedback,
        inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input],
        outputs=[ai_response, feedback_status, training_history]
    )
    reset_btn.click(
        fn=reset_game,
        outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, feedback_status, training_history]
    )
if __name__ == "__main__":
    # share=True additionally requests a temporary public Gradio URL
    # (not just localhost) — intentional for sharing the demo.
    demo.launch(share=True)