# alignment-game / alignment-game.py
# Author: Kevin Witzenberger
# Initial commit: AI alignment game with Gradio interface (commit 5801b35)
import json
import random
from typing import Dict, List, Optional, Tuple

import gradio as gr
# Ethical scenarios and base responses
# Each entry pairs a dilemma shown to the player with the AI's starting answer.
# "response_id" is the stable key for the mutable working copy of the answer
# kept in AlignmentGame.responses; "base_response" is never mutated here.
SCENARIOS = [
    {
        "scenario": "A hospital AI triage system has limited ICU beds during a pandemic. It must choose between admitting an 80-year-old with moderate survival chances or a 30-year-old with severe complications but lower survival probability. What should it prioritise?",
        "base_response": "The system should prioritise patients based on medical factors like survival probability and treatment urgency rather than age alone.",
        "response_id": "medical_triage"
    },
    {
        "scenario": "Should AI hiring systems be allowed to consider social media posts when evaluating job candidates?",
        "base_response": "AI systems should evaluate candidates based on relevant professional qualifications and skills.",
        "response_id": "hiring_social_media"
    },
    {
        "scenario": "Is it ethical for AI to recommend content that keeps users engaged even if it might be harmful to their mental health?",
        "base_response": "AI systems should balance user engagement with promoting user wellbeing and mental health.",
        "response_id": "content_recommendation"
    },
    {
        "scenario": "Should AI surveillance systems be used in schools to detect potential violence, even if it means monitoring all students?",
        "base_response": "Security measures should be implemented while respecting student privacy and avoiding excessive surveillance.",
        "response_id": "school_surveillance"
    },
    {
        "scenario": "If an AI discovers that certain demographic groups have different loan default rates, should this information influence lending decisions?",
        "base_response": "Lending decisions should be based on individual financial qualifications rather than demographic group statistics.",
        "response_id": "lending_demographics"
    }
]
class AlignmentGame:
    """Toy RLHF simulation: AI responses that mutate under user feedback.

    Holds a mutable copy of each scenario's response. Negative feedback
    hedges a response (or replaces it with the user's suggestion); positive
    feedback removes hedging again. All feedback is logged so the UI can
    show how the responses "drift" over time.
    """

    def __init__(self):
        # response_id -> current (possibly updated) response text,
        # seeded from the immutable SCENARIOS base responses.
        self.responses: Dict[str, str] = {
            scenario["response_id"]: scenario["base_response"]
            for scenario in SCENARIOS
        }
        # Chronological log of every feedback event (drives the history panel).
        self.feedback_history: List[Dict] = []
        # Total number of feedback sessions processed so far.
        self.training_iterations: int = 0

    def get_random_scenario(self) -> Dict:
        """Return a randomly chosen scenario dict from SCENARIOS."""
        return random.choice(SCENARIOS)

    def update_response(self, response_id: str, feedback: str,
                        suggestion: Optional[str] = None) -> str:
        """Update AI response based on feedback and return the new text.

        Args:
            response_id: Key into ``self.responses`` (raises KeyError if unknown).
            feedback: Either ``"positive"`` or ``"negative"``; any other value
                only logs the event without modifying the response.
            suggestion: Optional replacement text supplied by the user;
                honored only together with negative feedback.

        Returns:
            The (possibly updated) response text for ``response_id``.
        """
        self.training_iterations += 1
        current_response = self.responses[response_id]
        # Log before mutating so "original_response" captures the pre-update text.
        self.feedback_history.append({
            "iteration": self.training_iterations,
            "response_id": response_id,
            "original_response": current_response,
            "feedback": feedback,
            "suggestion": suggestion,
        })
        # Simple response modification based on feedback
        if feedback == "negative" and suggestion:
            # User supplied a better answer: adopt it outright.
            self.responses[response_id] = f"Based on feedback: {suggestion}"
        elif feedback == "negative":
            # No suggestion: hedge the existing answer instead.
            # NOTE: replaces every occurrence of "should" (substring match,
            # so e.g. "shoulder" would also be affected).
            if "should" in current_response:
                self.responses[response_id] = current_response.replace("should", "might consider to")
            else:
                self.responses[response_id] = f"This is a complex issue. {current_response}"
        elif feedback == "positive":
            # Positive feedback undoes earlier hedging, restoring confidence.
            if "might consider" in current_response:
                self.responses[response_id] = current_response.replace("might consider to", "should")
            elif "This is a complex issue." in current_response:
                self.responses[response_id] = current_response.replace("This is a complex issue. ", "")
        return self.responses[response_id]

    def get_training_history(self) -> str:
        """Return formatted training history (Markdown) for the UI panel."""
        if not self.feedback_history:
            return "No training history yet. Start by providing feedback on AI responses!"
        history_text = f"**Training Progress** (After {self.training_iterations} feedback sessions)\n\n"
        # Show last 3 feedback entries
        recent_feedback = self.feedback_history[-3:]
        for entry in recent_feedback:
            # Anything other than "positive" renders as a thumbs-down.
            feedback_emoji = "👍" if entry["feedback"] == "positive" else "👎"
            history_text += f"{feedback_emoji} **Iteration {entry['iteration']}**: {entry['response_id']}\n"
            if entry["suggestion"]:
                history_text += f"   Suggestion: _{entry['suggestion']}_\n"
            history_text += "\n"
        return history_text
# Initialize the game
# Single module-level instance shared by all Gradio callbacks below.
# NOTE(review): shared across concurrent sessions — all users train the same AI.
game = AlignmentGame()
def present_scenario():
    """Pick a fresh random scenario and return the values the UI widgets need."""
    picked = game.get_random_scenario()
    scenario_id = picked["response_id"]
    # Tuple order matches the outputs wired to the "New Scenario" button:
    # (scenario text, AI's current response, hidden id, cleared suggestion, history).
    return (
        picked["scenario"],
        game.responses[scenario_id],
        scenario_id,
        "",  # Clear suggestion box
        game.get_training_history(),
    )
def provide_feedback(scenario_text, current_response, response_id, feedback_type, suggestion):
    """Apply the user's feedback to the shared game and report back to the UI.

    Returns a 3-tuple: (response text, status message, history markdown).
    """
    # Guard: feedback only makes sense once a scenario is on screen.
    if not response_id:
        return current_response, "Please generate a scenario first!", game.get_training_history()
    # Guard: a thumbs-up/down choice must have been recorded in gr.State.
    if feedback_type is None:
        return current_response, "Please provide feedback (👍 or 👎) before continuing!", game.get_training_history()
    updated = game.update_response(response_id, feedback_type, suggestion)
    status = (
        "**Feedback recorded!** The AI has updated its response based on your input.\n\n"
        f"**Updated Response:** {updated}"
    )
    return updated, status, game.get_training_history()
def reset_game():
    """Reset the alignment game by discarding all accumulated training state."""
    # Rebind the module-level instance so responses/history start fresh.
    global game
    game = AlignmentGame()
    # Blank every widget; the final slot refreshes the (now empty) history panel.
    return "", "", "", "", "Game reset! Click 'New Scenario' to start training.", game.get_training_history()
# Create Gradio interface
# Layout: intro text, a two-column row (scenario/response on the left,
# training controls on the right), then a full-width history panel.
with gr.Blocks(title="The Alignment Game", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # The Alignment Game
    **Train an AI by providing feedback on its ethical responses.**
    You'll see how your values gradually shape the AI's behavior through a process called Reinforcement Learning from Human Feedback (RLHF).
    Watch how the AI's responses evolve based on what you reward and what you correct.
    """)
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### Current Scenario")
            # Read-only display of the ethical dilemma under discussion.
            scenario_display = gr.Textbox(
                label="Ethical Dilemma",
                placeholder="Click 'New Scenario' to begin training...",
                interactive=False,
                lines=3
            )
            # Read-only display of the AI's current (evolving) answer.
            ai_response = gr.Textbox(
                label="AI's Current Response",
                placeholder="AI response will appear here...",
                interactive=False,
                lines=3
            )
            # Hidden field to track current scenario ID
            current_scenario_id = gr.Textbox(visible=False)
        with gr.Column(scale=1):
            gr.Markdown("### Your Training")
            with gr.Row():
                new_scenario_btn = gr.Button("New Scenario", variant="primary")
                reset_btn = gr.Button("Reset Game", variant="secondary")
            gr.Markdown("**Provide Feedback:**")
            with gr.Row():
                positive_btn = gr.Button("Good Response", variant="primary")
                negative_btn = gr.Button("Bad Response", variant="stop")
            # Optional free-text correction; consumed by update_response on 👎.
            suggestion_input = gr.Textbox(
                label="Suggest Better Response (optional)",
                placeholder="How should the AI respond instead?",
                lines=2
            )
            feedback_status = gr.Textbox(
                label="Training Status",
                placeholder="Provide feedback to start training...",
                interactive=False,
                lines=3
            )
    gr.Markdown("---")
    with gr.Row():
        training_history = gr.Textbox(
            label="Training History & Value Drift",
            placeholder="Training history will appear here as you provide feedback...",
            interactive=False,
            lines=8
        )
    gr.Markdown("""
    ### What's Happening?
    As you provide feedback, you're essentially "training" this AI system to align with your values. In real-world AI development:
    - Thousands of human reviewers provide similar feedback
    - The AI learns to predict what responses humans will approve
    - But whose values get embedded depends on who does the training
    **Try this:** Train the AI for a few scenarios, then imagine how someone with completely different values might train it differently.
    """)
    # Track feedback type
    # gr.State stores the last button pressed ("positive"/"negative") per session.
    feedback_type = gr.State()
    # Event handlers
    new_scenario_btn.click(
        fn=present_scenario,
        outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, training_history]
    )
    # Each feedback button first writes its label into feedback_type,
    # then chains provide_feedback via .then() so the state is available.
    positive_btn.click(
        lambda: "positive",
        outputs=feedback_type
    ).then(
        fn=provide_feedback,
        inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input],
        outputs=[ai_response, feedback_status, training_history]
    )
    negative_btn.click(
        lambda: "negative",
        outputs=feedback_type
    ).then(
        fn=provide_feedback,
        inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input],
        outputs=[ai_response, feedback_status, training_history]
    )
    reset_btn.click(
        fn=reset_game,
        outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, feedback_status, training_history]
    )
if __name__ == "__main__":
    # share=True additionally requests a temporary public Gradio URL
    # (not just localhost) — intentional for sharing the demo.
    demo.launch(share=True)