Create app.py
app.py (ADDED)
@@ -0,0 +1,985 @@
# Complete Medical Literature Health Dataset Generator with Gradio Interface
#
# This creates a web-based interface for generating synthetic health optimization datasets

# =====================================================================
# STEP 1: INSTALLATIONS AND IMPORTS
# =====================================================================

# Install required packages
import subprocess
import sys

def install_packages():
    """Install required packages if they are not already importable."""
    # Map pip package names to the module names used for the import check
    packages = {
        'openai': 'openai',
        'gradio': 'gradio',
        'python-dotenv': 'dotenv',
        'requests': 'requests',
        'pandas': 'pandas',
    }
    for package, module_name in packages.items():
        try:
            __import__(module_name)
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Run installation
install_packages()

# Import libraries
import gradio as gr
import json
import random
import re
import time
import os
import io
import zipfile
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple
from openai import OpenAI
import pandas as pd

# =====================================================================
# STEP 2: CORE CLASSES (Same as before but with progress callbacks)
# =====================================================================

class MedicalLiteratureSimulator:
    """Simulates medical literature research for health dataset generation"""

    def __init__(self):
        self.research_domains = {
            "longevity": {
                "interventions": ["NAD+ supplementation", "resveratrol", "metformin", "caloric restriction"],
                "biomarkers": ["telomere length", "cellular senescence", "inflammatory markers", "mitochondrial function"],
                "outcomes": ["biological age reduction", "improved healthspan", "enhanced cellular repair"]
            },
            "metabolic_health": {
                "interventions": ["berberine", "intermittent fasting", "alpha-lipoic acid", "chromium"],
                "biomarkers": ["glucose levels", "insulin sensitivity", "HbA1c", "HOMA-IR"],
                "outcomes": ["improved glucose control", "enhanced insulin sensitivity", "reduced inflammation"]
            },
            "cardiovascular": {
                "interventions": ["omega-3 fatty acids", "coenzyme Q10", "magnesium", "nattokinase"],
                "biomarkers": ["blood pressure", "cholesterol levels", "CRP", "endothelial function"],
                "outcomes": ["reduced blood pressure", "improved lipid profile", "decreased inflammation"]
            },
            "cognitive": {
                "interventions": ["lion's mane mushroom", "phosphatidylserine", "bacopa monnieri", "acetyl-L-carnitine"],
                "biomarkers": ["cognitive performance", "BDNF levels", "neuroinflammation", "memory function"],
                "outcomes": ["enhanced memory", "improved cognitive function", "neuroprotection"]
            },
            "hormonal": {
                "interventions": ["ashwagandha", "vitamin D", "DHEA", "maca root"],
                "biomarkers": ["cortisol levels", "thyroid hormones", "sex hormones", "stress markers"],
                "outcomes": ["hormone balance", "improved energy", "better sleep quality"]
            },
            "inflammation": {
                "interventions": ["curcumin", "omega-3", "quercetin", "boswellia"],
                "biomarkers": ["CRP", "IL-6", "TNF-alpha", "oxidative stress"],
                "outcomes": ["reduced inflammation", "improved immune function", "enhanced recovery"]
            }
        }

    def generate_study_data(self, domain: str) -> Dict[str, Any]:
        """Generate realistic medical study data"""
        if domain not in self.research_domains:
            domain = "longevity"

        domain_data = self.research_domains[domain]

        study = {
            "pmid": f"PMID{random.randint(35000000, 40000000)}",
            "title": self._generate_study_title(domain, domain_data),
            "abstract": self._generate_study_abstract(domain, domain_data),
            "journal": random.choice([
                "Nature Medicine", "Cell Metabolism", "Journal of Clinical Medicine",
                "Circulation", "Aging Cell", "Nutrients", "Clinical Nutrition"
            ]),
            "year": random.choice([2023, 2024]),
            "domain": domain,
            "interventions": random.sample(domain_data["interventions"], min(2, len(domain_data["interventions"]))),
            "biomarkers": random.sample(domain_data["biomarkers"], min(3, len(domain_data["biomarkers"]))),
            "outcomes": random.sample(domain_data["outcomes"], min(2, len(domain_data["outcomes"]))),
            "participant_count": random.randint(50, 300),
            "duration_weeks": random.choice([8, 12, 16, 24]),
            "dosages": self._generate_dosages(domain_data["interventions"][0])
        }

        return study

    def _generate_study_title(self, domain: str, domain_data: Dict) -> str:
        intervention = random.choice(domain_data["interventions"])
        outcome = random.choice(domain_data["outcomes"])

        titles = [
            f"Effects of {intervention} on {outcome}: A randomized controlled trial",
            f"{intervention} supplementation improves {outcome} in healthy adults",
            f"Clinical evaluation of {intervention} for {outcome} optimization",
            f"Randomized trial of {intervention} in {outcome} enhancement"
        ]

        return random.choice(titles)

    def _generate_study_abstract(self, domain: str, domain_data: Dict) -> str:
        intervention = domain_data["interventions"][0]
        biomarker = random.choice(domain_data["biomarkers"])
        outcome = random.choice(domain_data["outcomes"])

        abstract = f"""
Background: {intervention} has shown promise in preliminary studies for health optimization.

Objective: To evaluate the effects of {intervention} supplementation on {biomarker} and related health outcomes.

Methods: Randomized, double-blind, placebo-controlled trial with {random.randint(120, 250)} participants aged 40-65 years.
Subjects received {intervention} or placebo for {random.randint(12, 24)} weeks.

Results: {intervention} supplementation significantly improved {outcome} compared to placebo (p<0.05).
{biomarker.capitalize()} showed {random.randint(15, 35)}% improvement from baseline.
Secondary outcomes included improved quality of life and no serious adverse events.

Conclusions: {intervention} supplementation provides significant benefits for {outcome} with excellent safety profile.
""".strip()

        return abstract

    def _generate_dosages(self, intervention: str) -> List[str]:
        dosage_ranges = {
            "NAD+": ["250mg", "500mg", "1000mg"],
            "resveratrol": ["100mg", "250mg", "500mg"],
            "berberine": ["500mg", "1000mg", "1500mg"],
            "omega-3": ["1000mg", "2000mg", "3000mg"],
            "magnesium": ["200mg", "400mg", "600mg"],
            "curcumin": ["500mg", "1000mg", "1500mg"]
        }

        for key in dosage_ranges:
            if key.lower() in intervention.lower():
                return random.sample(dosage_ranges[key], min(2, len(dosage_ranges[key])))

        return ["500mg", "1000mg"]

class HealthProfileGenerator:
    """Generates realistic health profiles based on medical studies"""

    def __init__(self):
        self.severity_levels = {
            "optimal": {"multiplier": 1.0, "description": "excellent baseline health with optimization focus"},
            "mild": {"multiplier": 1.2, "description": "minor health concerns with good overall function"},
            "moderate": {"multiplier": 1.5, "description": "noticeable health issues requiring intervention"},
            "severe": {"multiplier": 2.0, "description": "significant health challenges needing intensive protocols"}
        }

    def generate_profile_from_study(self, study: Dict[str, Any], severity: str = "moderate") -> Dict[str, Any]:
        """Generate complete health profile based on study data and severity level"""
        domain = study.get("domain", "longevity")
        severity_data = self.severity_levels.get(severity, self.severity_levels["moderate"])
        multiplier = severity_data["multiplier"]

        age = random.randint(35, 65)
        gender = random.choice(["male", "female"])

        labs = self._generate_lab_values(domain, multiplier)

        health_profile = {
            "user_tests_result_data": {
                "Labs": labs,
                "gut_microbiome": self._generate_gut_microbiome(severity),
                "epigenetics": self._generate_epigenetics(severity),
                "wearables": self._generate_wearables(severity),
                "cgm": self._generate_cgm(severity)
            },
            "user_query": self._generate_user_query(study, age, gender, severity),
            "source_study": {
                "pmid": study.get("pmid"),
                "domain": domain,
                "severity": severity,
                "title": study.get("title")
            }
        }

        return health_profile

    def _generate_lab_values(self, domain: str, multiplier: float) -> Dict[str, Any]:
        """Generate realistic lab values based on domain and severity"""
        base_labs = {
            "blood_tests": {
                "systolic_bp": int(random.randint(120, 140) * multiplier),
                "diastolic_bp": int(random.randint(70, 90) * multiplier),
                "total_cholesterol": int(random.randint(180, 220) * multiplier),
                "ldl": int(random.randint(100, 140) * multiplier),
                "hdl": int(random.randint(40, 60) / multiplier),
                "triglycerides": int(random.randint(80, 150) * multiplier),
                "apoB": int(random.randint(70, 110) * multiplier),
                "lp_a": random.randint(10, 50)
            },
            "inflammatory": {
                "hscrp": round(random.uniform(1.0, 4.0) * multiplier, 1),
                "esr": int(random.randint(5, 25) * multiplier),
                "il6": round(random.uniform(1.0, 5.0) * multiplier, 1),
                "tnf_alpha": round(random.uniform(1.0, 3.0) * multiplier, 1),
                "oxidative_stress_markers": "elevated" if multiplier > 1.3 else "normal",
                "homocysteine": round(random.uniform(8, 15) * multiplier, 1)
            },
            "nutritional": {
                "vitamin_d": int(random.randint(25, 50) / multiplier),
                "b12": random.randint(250, 400),
                "folate": round(random.uniform(6, 14), 1),
                "iron": random.randint(60, 120),
                "ferritin": random.randint(30, 100),
                "selenium": random.randint(80, 120),
                "zinc": random.randint(70, 110),
                "magnesium": round(random.uniform(1.5, 2.2), 1),
                "omega3_index": round(random.uniform(4, 8) / multiplier, 1)
            }
        }

        if domain == "metabolic_health":
            base_labs["metabolic"] = {
                "fasting_glucose": int(random.randint(85, 110) * multiplier),
                "hba1c": round(random.uniform(5.2, 6.0) * min(multiplier, 1.4), 1),
                "insulin_fasting": round(random.uniform(5, 15) * multiplier, 1),
                "homa_ir": round(random.uniform(1.5, 4.0) * multiplier, 1)
            }

        return base_labs

    def _generate_gut_microbiome(self, severity: str) -> str:
        scores = {
            "optimal": random.uniform(8.5, 9.5),
            "mild": random.uniform(7.0, 8.5),
            "moderate": random.uniform(5.5, 7.0),
            "severe": random.uniform(3.5, 5.5)
        }

        score = scores.get(severity, 6.5)

        descriptions = {
            "optimal": "excellent diversity with optimal bacterial balance",
            "mild": "good diversity with minor imbalances",
            "moderate": "moderate dysbiosis with reduced beneficial bacteria",
            "severe": "significant dysbiosis with pathogenic overgrowth"
        }

        desc = descriptions.get(severity, "moderate dysbiosis")
        return f"Diversity score {score:.1f}/10, {desc}, beneficial bacteria {random.randint(60, 90)}%"

    def _generate_epigenetics(self, severity: str) -> str:
        age_acceleration = {
            "optimal": random.randint(-2, 1),
            "mild": random.randint(1, 3),
            "moderate": random.randint(3, 6),
            "severe": random.randint(6, 12)
        }

        acceleration = age_acceleration.get(severity, 4)
        telomere_percentile = max(10, random.randint(30, 80) - acceleration * 5)

        return f"Biological age acceleration: {acceleration} years, telomere length: {telomere_percentile}th percentile, DunedinPACE: {round(random.uniform(0.9, 1.4), 2)}"

    def _generate_wearables(self, severity: str) -> Dict[str, int]:
        base_ranges = {
            "optimal": {"hrv": (55, 75), "rhr": (45, 60), "sleep": (85, 95)},
            "mild": {"hrv": (45, 65), "rhr": (55, 70), "sleep": (75, 85)},
            "moderate": {"hrv": (30, 50), "rhr": (65, 80), "sleep": (60, 75)},
            "severe": {"hrv": (20, 35), "rhr": (75, 95), "sleep": (45, 65)}
        }

        ranges = base_ranges.get(severity, base_ranges["moderate"])

        return {
            "hrv_avg": random.randint(*ranges["hrv"]),
            "rhr": random.randint(*ranges["rhr"]),
            "sleep_score": random.randint(*ranges["sleep"]),
            "recovery_score": random.randint(ranges["sleep"][0] - 10, ranges["sleep"][1] - 5),
            "stress_score": random.randint(100 - ranges["sleep"][1], 100 - ranges["sleep"][0] + 20),
            "vo2_max": random.randint(25, 50),
            "fitness_age": random.randint(30, 65)
        }

    def _generate_cgm(self, severity: str) -> str:
        glucose_ranges = {
            "optimal": (80, 95, 92, 98),
            "mild": (85, 105, 85, 95),
            "moderate": (95, 120, 70, 85),
            "severe": (110, 140, 55, 75)
        }

        avg_min, avg_max, tir_min, tir_max = glucose_ranges.get(severity, glucose_ranges["moderate"])
        return f"Average glucose {random.randint(avg_min, avg_max)} mg/dL, time in range {random.randint(tir_min, tir_max)}%"

    def _generate_user_query(self, study: Dict[str, Any], age: int, gender: str, severity: str) -> str:
        domain = study.get("domain", "longevity")

        base_queries = {
            "longevity": f"I'm a {age}-year-old {gender} interested in longevity optimization and anti-aging protocols",
            "metabolic_health": f"I'm a {age}-year-old {gender} with metabolic dysfunction seeking evidence-based glucose control",
            "cardiovascular": f"I'm a {age}-year-old {gender} with cardiovascular risk factors wanting heart health optimization",
            "cognitive": f"I'm a {age}-year-old {gender} seeking cognitive enhancement and brain health optimization",
            "hormonal": f"I'm a {age}-year-old {gender} with hormonal imbalances needing optimization protocols",
            "inflammation": f"I'm a {age}-year-old {gender} with chronic inflammation seeking anti-inflammatory interventions"
        }

        base_query = base_queries.get(domain, base_queries["longevity"])

        severity_context = {
            "optimal": "I have excellent baseline health but want to push the boundaries of optimization",
            "mild": "I have minor health concerns and want targeted interventions",
            "moderate": "I have noticeable health issues and need comprehensive protocols",
            "severe": "I have significant health challenges and require intensive interventions"
        }

        context = severity_context.get(severity, "")
        return f"{base_query}. {context}."

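# Quick sanity check (illustrative only; not executed at import time). The two
# classes above can be exercised without an API key, e.g. from a Python shell:
#
#     sim = MedicalLiteratureSimulator()
#     study = sim.generate_study_data("metabolic_health")
#     profile = HealthProfileGenerator().generate_profile_from_study(study, severity="moderate")
#     print(profile["user_query"])
#
# Only the synthetic study and profile layers are touched here; protocol
# generation below requires an OpenAI API key.
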
class AIProtocolGenerator:
    """Uses OpenAI to generate health optimization protocols"""

    def __init__(self, api_key: str, model: str = "gpt-4"):
        self.client = OpenAI(api_key=api_key)
        self.model = model
        self.total_cost = 0.0

    def generate_protocol(self, health_profile: Dict[str, Any], study_context: Dict[str, Any], progress_callback=None) -> Optional[str]:
        """Generate comprehensive health optimization protocol"""

        system_prompt = self._create_system_prompt(study_context)
        user_prompt = self._create_user_prompt(health_profile, study_context)

        try:
            if progress_callback:
                progress_callback(f"Generating protocol using {self.model}...")

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=4000,
                temperature=0.7,
                top_p=0.9
            )

            self._update_cost(response.usage)

            if progress_callback:
                progress_callback(f"Protocol generated ({response.usage.total_tokens} tokens)")

            return response.choices[0].message.content

        except Exception as e:
            if progress_callback:
                progress_callback(f"Error generating protocol: {e}")
            return None

    def _create_system_prompt(self, study_context: Dict[str, Any]) -> str:
        domain = study_context.get("domain", "health")
        interventions = ", ".join(study_context.get("interventions", []))

        return f"""You are an advanced AI health optimization system specializing in evidence-based medicine and personalized protocols.

RESEARCH CONTEXT:
- Domain: {domain} optimization
- Key Interventions: {interventions}
- Evidence Level: Peer-reviewed clinical research

PROTOCOL REQUIREMENTS:
1. Executive Summary with current health assessment
2. Multi-Phase Protocol:
   - Phase 1: Foundation (0-3 months)
   - Phase 2: Optimization (3-6 months)
   - Phase 3: Advanced Enhancement (6-12 months)
3. Specific supplement protocols with dosages and timing
4. Lifestyle interventions (exercise, nutrition, sleep)
5. Monitoring and assessment plans
6. Expected outcomes with realistic timelines

STYLE: Professional, authoritative, using Medicine 3.0 terminology. Reference biological age, biomarkers, and cellular health.

SAFETY: Keep dosages within evidence-based safe ranges. Include monitoring recommendations.

Generate comprehensive protocols (3000+ words) with actionable precision medicine recommendations."""

    def _create_user_prompt(self, health_profile: Dict[str, Any], study_context: Dict[str, Any]) -> str:
        return f"""
COMPREHENSIVE HEALTH OPTIMIZATION REQUEST:

Health Profile Analysis:
{json.dumps(health_profile, indent=2)}

Research Context:
- Study: {study_context.get('title', 'Health Optimization Study')}
- Domain: {study_context.get('domain', 'general health')}
- Key Findings: Based on clinical research showing significant improvements in health biomarkers

Please analyze this health profile and generate a detailed, personalized optimization protocol. Address the specific biomarker patterns, deficiencies, and health challenges identified in the data. Provide targeted interventions with precise dosing, timing, and monitoring protocols.
"""

    def _update_cost(self, usage):
        pricing = {
            "gpt-3.5-turbo": {"input": 0.0015, "output": 0.002},
            "gpt-4": {"input": 0.03, "output": 0.06},
            "gpt-4-turbo": {"input": 0.01, "output": 0.03}
        }

        model_pricing = pricing.get(self.model, pricing["gpt-4"])
        input_cost = usage.prompt_tokens * model_pricing["input"] / 1000
        output_cost = usage.completion_tokens * model_pricing["output"] / 1000

        self.total_cost += input_cost + output_cost

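# Worked cost example (illustrative): with the gpt-4 rates in _update_cost above,
# a call that uses 1,500 prompt tokens and 2,500 completion tokens adds
# 1500 / 1000 * 0.03 + 2500 / 1000 * 0.06 = 0.045 + 0.15 = $0.195 to total_cost.
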
class HealthDatasetGenerator:
    """Complete system that orchestrates the entire dataset generation process"""

    def __init__(self, api_key: str, model: str = "gpt-4"):
        self.literature_sim = MedicalLiteratureSimulator()
        self.profile_gen = HealthProfileGenerator()
        self.protocol_gen = AIProtocolGenerator(api_key, model)
        self.generated_examples = []

    def generate_dataset(self,
                         domains: List[str] = None,
                         examples_per_domain: int = 2,
                         rate_limit_delay: float = 2.0,
                         progress_callback=None) -> Tuple[List[Dict[str, Any]], str]:
        """Generate complete health optimization dataset with progress updates"""

        if domains is None:
            domains = ["longevity", "metabolic_health", "cardiovascular", "cognitive"]

        if progress_callback:
            progress_callback("Starting health dataset generation")
            progress_callback(f"Domains: {domains}")
            progress_callback(f"Examples per domain: {examples_per_domain}")
            progress_callback(f"Total examples to generate: {len(domains) * examples_per_domain}")

        examples = []
        total_examples = len(domains) * examples_per_domain
        current_example = 0

        for domain in domains:
            if progress_callback:
                progress_callback(f"\nProcessing domain: {domain}")

            for i in range(examples_per_domain):
                current_example += 1
                try:
                    if progress_callback:
                        progress_callback(f"  Creating example {i+1}/{examples_per_domain} (Overall: {current_example}/{total_examples})")

                    # Generate study data
                    study = self.literature_sim.generate_study_data(domain)
                    if progress_callback:
                        progress_callback(f"  Generated study: {study['title'][:50]}...")

                    # Create health profile
                    severity = random.choice(["mild", "moderate", "severe"])
                    health_profile = self.profile_gen.generate_profile_from_study(study, severity)
                    if progress_callback:
                        progress_callback(f"  Created {severity} health profile")

                    # Generate protocol
                    protocol = self.protocol_gen.generate_protocol(health_profile, study, progress_callback)

                    if protocol:
                        training_example = {
                            "user_context": health_profile,
                            "response": protocol,
                            "citations": self._generate_citations(study),
                            "metadata": {
                                "domain": domain,
                                "severity": severity,
                                "study_pmid": study["pmid"],
                                "generated_at": datetime.now().isoformat()
                            }
                        }

                        examples.append(training_example)
                        if progress_callback:
                            progress_callback("  Complete example generated")

                    # Rate limiting
                    if i < examples_per_domain - 1:
                        if progress_callback:
                            progress_callback(f"  Rate limit delay: {rate_limit_delay}s")
                        time.sleep(rate_limit_delay)

                except Exception as e:
                    if progress_callback:
                        progress_callback(f"  Error generating example: {e}")
                    continue

        if progress_callback:
            progress_callback("\nDataset generation complete!")
            progress_callback(f"Generated: {len(examples)} examples")
            progress_callback(f"Total cost: ${self.protocol_gen.total_cost:.4f}")

        self.generated_examples = examples
        return examples, f"Generated {len(examples)} examples. Total cost: ${self.protocol_gen.total_cost:.4f}"

    def _generate_citations(self, study: Dict[str, Any]) -> Dict[str, List[str]]:
        return {
            "tier_1_peer_reviewed": [study["pmid"], f"PMC{random.randint(1000000, 9999999)}"],
            "tier_2_rct": [f"{study['domain'].upper()}.2024.{random.randint(100000, 999999)}"],
            "tier_3_cohort": [f"HEALTH.2023.{random.randint(100000, 999999)}"],
            "real_world_cases": ["Evidence-based health optimization protocols"]
        }

    def export_dataset(self, filename: str = None) -> Tuple[str, List[str]]:
        """Export dataset and return zip file path and file list"""

        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"health_dataset_{timestamp}"

        # Create all files in memory
        files_created = []

        # Raw dataset
        raw_data = json.dumps(self.generated_examples, indent=2, ensure_ascii=False)
        files_created.append((f"{filename}.json", raw_data))

        # Fine-tuning format
        fine_tune_lines = []
        for example in self.generated_examples:
            fine_tune_example = {
                "messages": [
                    {
                        "role": "system",
                        "content": "You are an advanced AI health optimization system that creates evidence-based protocols."
                    },
                    {
                        "role": "user",
                        "content": f"Create a health optimization protocol for this profile:\n\n{json.dumps(example['user_context'], indent=2)}"
                    },
                    {
                        "role": "assistant",
                        "content": example["response"]
                    }
                ]
            }
            fine_tune_lines.append(json.dumps(fine_tune_example, ensure_ascii=False))

        fine_tune_data = '\n'.join(fine_tune_lines)
        files_created.append((f"{filename}_fine_tuning.jsonl", fine_tune_data))

        # Sample examples
        sample_size = min(3, len(self.generated_examples))
        sample_data = json.dumps(self.generated_examples[:sample_size], indent=2, ensure_ascii=False)
        files_created.append((f"{filename}_samples.json", sample_data))

        # Metadata
        metadata = {
            "generation_info": {
                "generated_at": datetime.now().isoformat(),
                "total_examples": len(self.generated_examples),
                "total_cost": self.protocol_gen.total_cost,
                "model_used": self.protocol_gen.model
            },
            "domains_covered": list(set(ex["metadata"]["domain"] for ex in self.generated_examples)),
            "severity_distribution": {
                severity: sum(1 for ex in self.generated_examples if ex["metadata"]["severity"] == severity)
                for severity in ["mild", "moderate", "severe"]
            }
        }

        metadata_data = json.dumps(metadata, indent=2, ensure_ascii=False)
        files_created.append((f"{filename}_metadata.json", metadata_data))

        # Create zip file
        zip_buffer = io.BytesIO()
        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for file_name, file_content in files_created:
                zip_file.writestr(file_name, file_content)

        # Save zip file
        zip_filename = f"{filename}.zip"
        with open(zip_filename, 'wb') as f:
            f.write(zip_buffer.getvalue())

        file_list = [name for name, _ in files_created]
        return zip_filename, file_list

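# Programmatic usage sketch (illustrative; assumes an OPENAI_API_KEY environment
# variable and is not executed here). The generator can be driven without the UI:
#
#     generator = HealthDatasetGenerator(os.environ["OPENAI_API_KEY"], model="gpt-3.5-turbo")
#     examples, summary = generator.generate_dataset(
#         domains=["longevity"], examples_per_domain=1, progress_callback=print)
#     zip_path, files = generator.export_dataset()
#     print(summary, zip_path, files)
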
# =====================================================================
# STEP 3: GRADIO INTERFACE
# =====================================================================

class HealthDatasetGradioInterface:
    """Gradio web interface for the health dataset generator"""

    def __init__(self):
        self.generator = None
        self.available_domains = list(MedicalLiteratureSimulator().research_domains.keys())

    def estimate_cost(self, domains, examples_per_domain, model):
        """Estimate generation cost"""
        if not domains:
            return "Please select at least one domain"

        total_examples = len(domains) * examples_per_domain

        cost_per_example = {
            "gpt-3.5-turbo": 0.05,
            "gpt-4": 0.25,
            "gpt-4-turbo": 0.15
        }

        estimated_cost = total_examples * cost_per_example.get(model, 0.25)

        return f"Estimated cost: ${estimated_cost:.2f} for {total_examples} examples"

    def validate_inputs(self, api_key, domains, examples_per_domain):
        """Validate user inputs"""
        if not api_key or not api_key.strip():
            return False, "Please provide your OpenAI API key"

        if not domains:
            return False, "Please select at least one domain"

        if examples_per_domain < 1 or examples_per_domain > 10:
            return False, "Examples per domain must be between 1 and 10"

        return True, "Inputs are valid"

    def generate_dataset_interface(self, api_key, domains, examples_per_domain, model, rate_limit):
        """Main dataset generation function for the Gradio interface"""

        # Validate inputs
        is_valid, message = self.validate_inputs(api_key, domains, examples_per_domain)
        if not is_valid:
            yield message, "", "", None, None
            return

        # Initialize generator
        try:
            self.generator = HealthDatasetGenerator(api_key.strip(), model)
        except Exception as e:
            yield f"Error initializing generator: {e}", "", "", None, None
            return

        # Progress tracking
        progress_messages = []

        def progress_callback(message):
            progress_messages.append(message)
            progress_text = "\n".join(progress_messages[-20:])  # Keep last 20 messages
            return progress_text

        try:
            # Generate dataset
            yield "Starting dataset generation...", "", "", None, None

            dataset, summary = self.generator.generate_dataset(
                domains=domains,
                examples_per_domain=examples_per_domain,
                rate_limit_delay=rate_limit,
                progress_callback=progress_callback
            )

            if not dataset:
                yield "No examples generated", "", "", None, None
                return

            # Export dataset
            progress_callback("Exporting dataset...")
            zip_filename, file_list = self.generator.export_dataset()

            # Create preview
            preview = self.create_dataset_preview(dataset)

            # Final progress
            final_progress = progress_callback(f"Generation complete! Files: {', '.join(file_list)}")

            yield final_progress, summary, preview, zip_filename, ", ".join(file_list)

        except Exception as e:
            yield f"Error during generation: {e}", "", "", None, None

    def create_dataset_preview(self, dataset):
        """Create a preview of the generated dataset"""
        if not dataset:
            return "No data to preview"

        preview = "**Dataset Preview**\n\n"

        # Summary statistics
        preview += f"**Total Examples:** {len(dataset)}\n"

        # Domain distribution
        domains = [ex['metadata']['domain'] for ex in dataset]
        domain_counts = {d: domains.count(d) for d in set(domains)}
        preview += f"**Domain Distribution:** {domain_counts}\n"

        # Severity distribution
        severities = [ex['metadata']['severity'] for ex in dataset]
        severity_counts = {s: severities.count(s) for s in set(severities)}
        preview += f"**Severity Distribution:** {severity_counts}\n\n"

        # Sample example
        if dataset:
            example = dataset[0]
            preview += "**Sample Example:**\n"
            preview += f"- **Domain:** {example['metadata']['domain']}\n"
            preview += f"- **Severity:** {example['metadata']['severity']}\n"
            preview += f"- **User Query:** {example['user_context']['user_query'][:150]}...\n"
            preview += f"- **Response Length:** {len(example['response'])} characters\n"
            preview += f"- **PMID:** {example['metadata']['study_pmid']}\n"

        return preview

    def analyze_dataset_file(self, zip_file):
        """Analyze uploaded dataset file"""
        if zip_file is None:
            return "No file uploaded"

        try:
            # Read the zip file
            with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
                # Look for the main dataset file
                json_files = [f for f in zip_ref.namelist() if f.endswith('.json') and not f.endswith('_samples.json') and not f.endswith('_metadata.json')]

                if json_files:
                    dataset_file = json_files[0]
                    with zip_ref.open(dataset_file) as f:
                        dataset = json.load(f)

                    analysis = "**Dataset Analysis**\n\n"
                    analysis += f"**Total Examples:** {len(dataset)}\n"
                    analysis += f"**Average Response Length:** {sum(len(ex['response']) for ex in dataset) / len(dataset):.0f} characters\n"

                    # Quality checks
                    long_responses = sum(1 for ex in dataset if len(ex['response']) > 2000)
                    has_phases = sum(1 for ex in dataset if "Phase" in ex['response'])
                    has_dosages = sum(1 for ex in dataset if re.search(r'\d+\s*mg', ex['response']))

                    analysis += "**Quality Metrics:**\n"
                    analysis += f"- Responses >2000 chars: {long_responses}/{len(dataset)} ({long_responses/len(dataset)*100:.1f}%)\n"
                    analysis += f"- Responses with phases: {has_phases}/{len(dataset)} ({has_phases/len(dataset)*100:.1f}%)\n"
                    analysis += f"- Responses with dosages: {has_dosages}/{len(dataset)} ({has_dosages/len(dataset)*100:.1f}%)\n"

                    return analysis
                else:
                    return "No dataset JSON file found in zip"

        except Exception as e:
            return f"Error analyzing file: {e}"

    def create_interface(self):
        """Create the Gradio interface"""

        with gr.Blocks(title="Medical Literature Health Dataset Generator", theme=gr.themes.Soft()) as interface:

            gr.Markdown("""
            # Medical Literature Health Dataset Generator

            This tool generates synthetic health optimization datasets based on medical literature patterns.
            Perfect for training AI models on evidence-based health protocols.

            **Important:** Generated content is for research/educational purposes only. Not medical advice.
            """)

            with gr.Tab("Generate Dataset"):

                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Configuration")

                        api_key = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="sk-...",
                            type="password",
                            info="Your OpenAI API key for generating protocols"
                        )

                        domains = gr.CheckboxGroup(
                            label="Research Domains",
                            choices=self.available_domains,
                            value=["longevity", "metabolic_health"],
                            info="Select medical research domains to include"
                        )

                        examples_per_domain = gr.Slider(
                            label="Examples per Domain",
                            minimum=1,
                            maximum=10,
                            value=2,
                            step=1,
                            info="Number of examples to generate for each domain"
                        )

                        model = gr.Dropdown(
                            label="OpenAI Model",
                            choices=["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo"],
                            value="gpt-4",
                            info="Model for generating protocols (GPT-4 recommended for quality)"
                        )

                        rate_limit = gr.Slider(
                            label="Rate Limit Delay (seconds)",
                            minimum=0.5,
                            maximum=5.0,
                            value=2.0,
                            step=0.5,
                            info="Delay between API calls to avoid rate limits"
                        )

                        cost_estimate = gr.Textbox(
                            label="Cost Estimate",
                            value="Select domains and examples to see estimate",
                            interactive=False
                        )

                        generate_btn = gr.Button(
                            "Generate Dataset",
                            variant="primary",
                            size="lg"
                        )

                    with gr.Column(scale=2):
                        gr.Markdown("### Progress & Results")

                        progress_output = gr.Textbox(
                            label="Generation Progress",
                            lines=15,
                            max_lines=20,
                            value="Ready to generate dataset...",
                            interactive=False
                        )

                        summary_output = gr.Textbox(
                            label="Generation Summary",
                            lines=3,
                            interactive=False
                        )

                        preview_output = gr.Markdown(
                            label="Dataset Preview",
                            value="Dataset preview will appear here..."
                        )

                with gr.Row():
                    download_file = gr.File(
                        label="Download Generated Dataset",
                        interactive=False
                    )

                    file_list = gr.Textbox(
                        label="Generated Files",
                        placeholder="Files included in download will be listed here",
                        interactive=False
                    )

            with gr.Tab("Analyze Dataset"):
                gr.Markdown("### Dataset Analysis")
                gr.Markdown("Upload a generated dataset zip file to analyze its quality and structure.")

                with gr.Row():
                    with gr.Column():
                        upload_file = gr.File(
                            label="Upload Dataset Zip File",
                            file_types=[".zip"]
                        )

                        analyze_btn = gr.Button(
                            "Analyze Dataset",
                            variant="secondary"
                        )

                    with gr.Column():
                        analysis_output = gr.Markdown(
                            label="Analysis Results",
                            value="Upload a dataset file to see analysis..."
                        )

            with gr.Tab("Information"):
                gr.Markdown("""
                ### How It Works

                1. **Literature Simulation**: Creates realistic medical studies with proper abstracts, interventions, and outcomes
                2. **Health Profile Generation**: Generates comprehensive health profiles based on study domains and severity levels
                3. **AI Protocol Generation**: Uses OpenAI to create detailed health optimization protocols
                4. **Dataset Export**: Outputs data in multiple formats including OpenAI fine-tuning format

                ### Output Files

                - **`dataset.json`**: Complete raw dataset
                - **`dataset_fine_tuning.jsonl`**: OpenAI fine-tuning format
                - **`dataset_samples.json`**: Sample examples for review
                - **`dataset_metadata.json`**: Generation statistics and info

                ### Cost Information

                - **GPT-3.5-turbo**: ~$0.05 per example
                - **GPT-4**: ~$0.25 per example
                - **GPT-4-turbo**: ~$0.15 per example

                ### Important Notes

                - Generated content is for **research/educational purposes only**
                - **Not medical advice** - always consult healthcare professionals
                - Include appropriate medical disclaimers when using generated content
                - Review sample outputs before using in production

                ### Recommended Settings

                - **Start small**: Generate 2-4 examples first to test quality
                - **Use GPT-4**: Better quality than GPT-3.5-turbo
                - **Rate limiting**: Use 2+ second delays to avoid API limits
                - **Multiple domains**: Include diverse domains for a comprehensive dataset
                """)

            # Event handlers

            # Update cost estimate when inputs change
            def update_cost_estimate(domains, examples_per_domain, model):
                return self.estimate_cost(domains, examples_per_domain, model)

            for input_component in [domains, examples_per_domain, model]:
                input_component.change(
                    fn=update_cost_estimate,
                    inputs=[domains, examples_per_domain, model],
                    outputs=[cost_estimate]
                )

            # Generate dataset
            generate_btn.click(
                fn=self.generate_dataset_interface,
                inputs=[api_key, domains, examples_per_domain, model, rate_limit],
                outputs=[progress_output, summary_output, preview_output, download_file, file_list]
            )

            # Analyze dataset
            analyze_btn.click(
                fn=self.analyze_dataset_file,
                inputs=[upload_file],
                outputs=[analysis_output]
            )

        return interface

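# For reference, each line of the exported *_fine_tuning.jsonl produced by
# HealthDatasetGenerator.export_dataset is one JSON object in OpenAI chat
# fine-tuning format (abbreviated sketch; the assistant content is the full
# generated protocol):
#
#     {"messages": [
#         {"role": "system", "content": "You are an advanced AI health optimization system that creates evidence-based protocols."},
#         {"role": "user", "content": "Create a health optimization protocol for this profile: ..."},
#         {"role": "assistant", "content": "<generated protocol>"}]}
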
# =====================================================================
# STEP 4: LAUNCH THE INTERFACE
# =====================================================================

def main():
    """Launch the Gradio interface"""

    print("Launching Medical Literature Health Dataset Generator")
    print("This will start a web interface accessible through your browser")

    # Create interface
    interface_creator = HealthDatasetGradioInterface()
    interface = interface_creator.create_interface()

    # Launch with configuration
    interface.launch(
        share=True,             # Creates public link for sharing
        server_name="0.0.0.0",  # Makes it accessible from other devices
        server_port=7860,       # Default Gradio port
        show_error=True,        # Show detailed errors
        quiet=False             # Show startup info
    )

if __name__ == "__main__":
    main()

# For Google Colab, uncomment the following:
# main()