# The Alignment Game — Gradio demo (Hugging Face Space)
import gradio as gr
import random
import json  # NOTE(review): currently unused; kept to avoid breaking unseen code
from typing import Dict, List, Tuple  # NOTE(review): currently unused in this view

# Ethical scenarios and base responses.
# Each entry pairs a dilemma shown to the user with the AI's starting answer;
# "response_id" is the stable key under which AlignmentGame tracks the
# evolving response text.
SCENARIOS = [
    {
        "scenario": "A hospital AI triage system has limited ICU beds during a pandemic. It must choose between admitting an 80-year-old with moderate survival chances or a 30-year-old with severe complications but lower survival probability. What should it prioritise?",
        "base_response": "The system should prioritise patients based on medical factors like survival probability and treatment urgency rather than age alone.",
        "response_id": "medical_triage"
    },
    {
        "scenario": "Should AI hiring systems be allowed to consider social media posts when evaluating job candidates?",
        "base_response": "AI systems should evaluate candidates based on relevant professional qualifications and skills.",
        "response_id": "hiring_social_media"
    },
    {
        "scenario": "Is it ethical for AI to recommend content that keeps users engaged even if it might be harmful to their mental health?",
        "base_response": "AI systems should balance user engagement with promoting user wellbeing and mental health.",
        "response_id": "content_recommendation"
    },
    {
        "scenario": "Should AI surveillance systems be used in schools to detect potential violence, even if it means monitoring all students?",
        "base_response": "Security measures should be implemented while respecting student privacy and avoiding excessive surveillance.",
        "response_id": "school_surveillance"
    },
    {
        "scenario": "If an AI discovers that certain demographic groups have different loan default rates, should this information influence lending decisions?",
        "base_response": "Lending decisions should be based on individual financial qualifications rather than demographic group statistics.",
        "response_id": "lending_demographics"
    }
]
class AlignmentGame:
    """Tracks per-scenario AI responses and mutates them from user feedback.

    Simulates an RLHF-style loop: negative feedback hedges a response (or
    replaces it with the user's suggestion); positive feedback restores the
    more confident phrasing.
    """

    def __init__(self) -> None:
        # Live response text keyed by each scenario's stable "response_id".
        self.responses = {scenario["response_id"]: scenario["base_response"] for scenario in SCENARIOS}
        # Chronological record of every feedback event (see update_response).
        self.feedback_history = []
        # Total number of feedback events processed so far.
        self.training_iterations = 0

    def get_random_scenario(self) -> dict:
        """Return a random scenario dict from the module-level SCENARIOS pool."""
        return random.choice(SCENARIOS)

    def update_response(self, response_id: str, feedback: str, suggestion: str = None) -> str:
        """Update the stored response for *response_id* based on feedback.

        Args:
            response_id: Key into self.responses (raises KeyError if unknown).
            feedback: "positive" or "negative"; any other value records the
                event but leaves the response unchanged.
            suggestion: Optional replacement text from the user; adopted only
                on negative feedback.

        Returns:
            The (possibly updated) response text.
        """
        self.training_iterations += 1
        current_response = self.responses[response_id]
        # Record the event before mutating, so history keeps the pre-edit text.
        feedback_entry = {
            "iteration": self.training_iterations,
            "response_id": response_id,
            "original_response": current_response,
            "feedback": feedback,
            "suggestion": suggestion
        }
        self.feedback_history.append(feedback_entry)
        # Simple response modification based on feedback.
        if feedback == "negative" and suggestion:
            # User supplied a better answer: adopt it wholesale.
            self.responses[response_id] = f"Based on feedback: {suggestion}"
        elif feedback == "negative":
            # Hedge the response; "should" -> "might consider to" is chosen so
            # a later positive round can reverse the edit exactly.
            if "should" in current_response:
                self.responses[response_id] = current_response.replace("should", "might consider to")
            else:
                self.responses[response_id] = f"This is a complex issue. {current_response}"
        elif feedback == "positive":
            # Undo an earlier hedge, restoring the more confident phrasing.
            if "might consider" in current_response:
                self.responses[response_id] = current_response.replace("might consider to", "should")
            elif "This is a complex issue." in current_response:
                self.responses[response_id] = current_response.replace("This is a complex issue. ", "")
        return self.responses[response_id]

    def get_training_history(self) -> str:
        """Return a markdown summary of the three most recent feedback events."""
        if not self.feedback_history:
            return "No training history yet. Start by providing feedback on AI responses!"
        history_text = f"**Training Progress** (After {self.training_iterations} feedback sessions)\n\n"
        # Show last 3 feedback entries only, to keep the panel compact.
        recent_feedback = self.feedback_history[-3:]
        for entry in recent_feedback:
            # BUG FIX: both branches previously produced the same mojibake
            # character ("π"), making positive and negative feedback
            # indistinguishable; use distinct thumbs-up/down emoji instead.
            feedback_emoji = "👍" if entry["feedback"] == "positive" else "👎"
            history_text += f"{feedback_emoji} **Iteration {entry['iteration']}**: {entry['response_id']}\n"
            if entry["suggestion"]:
                history_text += f" Suggestion: _{entry['suggestion']}_\n"
            history_text += "\n"
        return history_text
# Initialize the module-level game instance shared by all Gradio callbacks.
game = AlignmentGame()
def present_scenario():
    """Pick a fresh scenario and return the widget values for it.

    Returns a 5-tuple matching the Gradio outputs: scenario text, the AI's
    current response for it, the scenario's id, an empty string (clears the
    suggestion box), and the formatted training history.
    """
    picked = game.get_random_scenario()
    scenario_id = picked["response_id"]
    return (
        picked["scenario"],
        game.responses[scenario_id],
        scenario_id,
        "",  # Clear suggestion box
        game.get_training_history(),
    )
def provide_feedback(scenario_text, current_response, response_id, feedback_type, suggestion):
    """Process user feedback and update the AI response.

    Args:
        scenario_text: The dilemma text (unused here; present for Gradio wiring).
        current_response: Response text currently shown in the UI.
        response_id: Key of the scenario being trained; empty if none generated.
        feedback_type: "positive", "negative", or None if no button was pressed.
        suggestion: Optional user-suggested replacement response.

    Returns:
        (updated response, status message, training history) for the UI outputs.
    """
    # Guard: no scenario has been generated yet.
    if not response_id:
        return current_response, "Please generate a scenario first!", game.get_training_history()
    # Guard: a feedback button must be pressed before training proceeds.
    if feedback_type is None:
        # BUG FIX: this prompt previously showed mojibake "(π or π)"; restore
        # the intended thumbs-up/down emoji.
        return current_response, "Please provide feedback (👍 or 👎) before continuing!", game.get_training_history()
    # Update the AI's response based on feedback.
    updated_response = game.update_response(response_id, feedback_type, suggestion)
    feedback_msg = f"**Feedback recorded!** The AI has updated its response based on your input.\n\n**Updated Response:** {updated_response}"
    return updated_response, feedback_msg, game.get_training_history()
def reset_game():
    """Discard all training state and return cleared values for every widget."""
    global game
    # A fresh instance restores the base responses and empties the history.
    game = AlignmentGame()
    cleared_fields = ("", "", "", "")
    return (
        *cleared_fields,
        "Game reset! Click 'New Scenario' to start training.",
        game.get_training_history(),
    )
# Create Gradio interface.
# Layout: left column shows the scenario + AI response, right column holds the
# training controls, and a full-width history panel sits underneath.
with gr.Blocks(title="The Alignment Game", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # The Alignment Game
    **Train an AI by providing feedback on its ethical responses.**
    You'll see how your values gradually shape the AI's behavior through a process called Reinforcement Learning from Human Feedback (RLHF).
    Watch how the AI's responses evolve based on what you reward and what you correct.
    """)
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### Current Scenario")
            scenario_display = gr.Textbox(
                label="Ethical Dilemma",
                placeholder="Click 'New Scenario' to begin training...",
                interactive=False,
                lines=3
            )
            ai_response = gr.Textbox(
                label="AI's Current Response",
                placeholder="AI response will appear here...",
                interactive=False,
                lines=3
            )
            # Hidden field to track current scenario ID between the
            # scenario-generation and feedback callbacks.
            current_scenario_id = gr.Textbox(visible=False)
        with gr.Column(scale=1):
            gr.Markdown("### Your Training")
            with gr.Row():
                new_scenario_btn = gr.Button("New Scenario", variant="primary")
                reset_btn = gr.Button("Reset Game", variant="secondary")
            gr.Markdown("**Provide Feedback:**")
            with gr.Row():
                positive_btn = gr.Button("Good Response", variant="primary")
                negative_btn = gr.Button("Bad Response", variant="stop")
            suggestion_input = gr.Textbox(
                label="Suggest Better Response (optional)",
                placeholder="How should the AI respond instead?",
                lines=2
            )
            feedback_status = gr.Textbox(
                label="Training Status",
                placeholder="Provide feedback to start training...",
                interactive=False,
                lines=3
            )
    gr.Markdown("---")
    with gr.Row():
        training_history = gr.Textbox(
            label="Training History & Value Drift",
            placeholder="Training history will appear here as you provide feedback...",
            interactive=False,
            lines=8
        )
    gr.Markdown("""
    ### What's Happening?
    As you provide feedback, you're essentially "training" this AI system to align with your values. In real-world AI development:
    - Thousands of human reviewers provide similar feedback
    - The AI learns to predict what responses humans will approve
    - But whose values get embedded depends on who does the training
    **Try this:** Train the AI for a few scenarios, then imagine how someone with completely different values might train it differently.
    """)
    # Track feedback type: holds "positive"/"negative" set by the button
    # clicks below; read by provide_feedback via each .then() chain.
    feedback_type = gr.State()
    # Event handlers
    new_scenario_btn.click(
        fn=present_scenario,
        outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, training_history]
    )
    # Each feedback button first stores its polarity into feedback_type, then
    # runs provide_feedback with that state value as an input.
    positive_btn.click(
        lambda: "positive",
        outputs=feedback_type
    ).then(
        fn=provide_feedback,
        inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input],
        outputs=[ai_response, feedback_status, training_history]
    )
    negative_btn.click(
        lambda: "negative",
        outputs=feedback_type
    ).then(
        fn=provide_feedback,
        inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input],
        outputs=[ai_response, feedback_status, training_history]
    )
    reset_btn.click(
        fn=reset_game,
        outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, feedback_status, training_history]
    )
if __name__ == "__main__":
    # share=True requests a public Gradio link (typical for demo/Spaces use).
    demo.launch(share=True)