Spaces:

charanKompala
/

business-analyst-gpt

Sleeping

App Files Files Community

charanKompala committed on May 27

Commit

8081e42

verified ·

1 Parent(s): 1cee08b

Update app.py

Browse files

Files changed (1) hide show

app.py +528 -746

app.py CHANGED Viewed

@@ -1,768 +1,550 @@
 import gradio as gr
-import json
-from datetime import datetime
-import os
-# Business Analyst templates and responses
-BA_TEMPLATES = {
-    "requirements": """
-# Business Requirements Document
-## Project Overview
-**Project:** {project_name}
-**Date:** {date}
-**Business Analyst:** {analyst_name}
-## Executive Summary
-{executive_summary}
-## Business Objectives
-{objectives}
-## Functional Requirements
-{functional_requirements}
-## Non-Functional Requirements
-{non_functional_requirements}
-## Acceptance Criteria
-{acceptance_criteria}
-## Assumptions and Dependencies
-{assumptions}
-## Risks and Mitigation
-{risks}
-""",
-    "user_story": """
-# User Story
-**As a** {user_type}
-**I want** {want}
-**So that** {benefit}
-## Acceptance Criteria
-{acceptance_criteria}
-## Definition of Done
-{definition_of_done}
-## Priority:** {priority}
-**Story Points:** {story_points}
-""",
-    "process_flow": """
-# Business Process Flow
-## Process Name: {process_name}
-## Process Overview
-{overview}
-## Process Steps:
-{steps}
-## Stakeholders Involved:
-{stakeholders}
-## Systems/Tools Used:
-{systems}
-## Key Performance Indicators (KPIs):
-{kpis}
-## Process Improvements:
-{improvements}
-""",
-    "gap_analysis": """
-# Gap Analysis Report
-## Current State Analysis
-{current_state}
-## Future State Vision
-{future_state}
-## Identified Gaps
-{gaps}
-## Impact Assessment
-{impact}
-## Recommendations
-{recommendations}
-## Implementation Roadmap
-{roadmap}
-""",
-    "stakeholder_analysis": """
-# Stakeholder Analysis
-## Project: {project_name}
-## Stakeholder Matrix
-### High Influence, High Interest (Manage Closely)
-{high_high}
-### High Influence, Low Interest (Keep Satisfied)
-{high_low}
-### Low Influence, High Interest (Keep Informed)
-{low_high}
-### Low Influence, Low Interest (Monitor)
-{low_low}
-## Communication Plan
-{communication_plan}
-## Engagement Strategy
-{engagement_strategy}
-""",
-    "data_analysis": """
-# Data Analysis Report
-## Dataset Overview
-**Dataset Name:** {dataset_name}
-**Shape:** {shape}
-**Upload Date:** {date}
-## Column Analysis
-{column_analysis}
-## Machine Learning Model Recommendations
-### 🤖 Suitable ML Models:
-{ml_recommendations}
-### 📊 Variable Identification:
-**Potential Dependent Variables (Target):**
-{dependent_vars}
-**Potential Independent Variables (Features):**
-{independent_vars}
-### 📈 Data Visualization Recommendations:
-{viz_recommendations}
-## Data Quality Assessment
-{data_quality}
-## Next Steps
-{next_steps}
 """
-}
-def analyze_dataset(file):
-    """Analyze uploaded dataset and provide ML recommendations"""
-    if file is None:
-        return "Please upload a dataset file (CSV format)"
-    try:
-        # Read the CSV file content
-        with open(file, 'r', encoding='utf-8') as f:
-            content = f.read()
-        # Parse CSV manually (simple approach)
-        lines = content.strip().split('\n')
-        if len(lines) < 2:
-            return "Dataset appears to be empty or invalid"
-        # Get headers
-        headers = [col.strip().strip('"') for col in lines[0].split(',')]
-        data_rows = lines[1:]
-        # Basic dataset info
-        dataset_name = os.path.basename(file)
-        shape = f"{len(data_rows)} rows × {len(headers)} columns"
-        # Analyze columns
-        column_analysis = analyze_columns_simple(headers, data_rows)
-        # ML model recommendations
-        ml_recommendations = recommend_ml_models_simple(headers, data_rows)
-        # Variable identification
-        dependent_vars, independent_vars = identify_variables_simple(headers, data_rows)
-        # Visualization recommendations
-        viz_recommendations = recommend_visualizations_simple(headers, data_rows)
-        # Data quality assessment
-        data_quality = assess_data_quality_simple(headers, data_rows)
-        # Next steps
-        next_steps = generate_next_steps_simple()
-        # Generate report
-        template = BA_TEMPLATES["data_analysis"]
-        return template.format(
-            dataset_name=dataset_name,
-            shape=shape,
-            date=datetime.now().strftime("%Y-%m-%d"),
-            column_analysis=column_analysis,
-            ml_recommendations=ml_recommendations,
-            dependent_vars=dependent_vars,
-            independent_vars=independent_vars,
-            viz_recommendations=viz_recommendations,
-            data_quality=data_quality,
-            next_steps=next_steps
-        )
-    except Exception as e:
-        return f"Error analyzing dataset: {str(e)}. Please ensure your file is a valid CSV format."
-def analyze_columns_simple(headers, data_rows):
-    """Simple column analysis without pandas"""
-    analysis = []
-    for i, col in enumerate(headers):
-        # Get sample values for this column
-        values = []
-        for row in data_rows[:100]:  # Sample first 100 rows
-            row_data = [cell.strip().strip('"') for cell in row.split(',')]
-            if i < len(row_data):
-                values.append(row_data[i])
-        # Basic analysis
-        non_empty_values = [v for v in values if v and v != '']
-        unique_values = len(set(non_empty_values))
-        missing_count = len(values) - len(non_empty_values)
-        # Determine data type
-        is_numeric = all(is_number(v) for v in non_empty_values[:10] if v)
-        if is_numeric:
-            col_type = "Numerical"
-            numeric_vals = [float(v) for v in non_empty_values if is_number(v)]
-            if numeric_vals:
-                stats = f"Range: {min(numeric_vals)} to {max(numeric_vals)}, Avg: {sum(numeric_vals)/len(numeric_vals):.2f}"
-            else:
-                stats = "No valid numeric data"
         else:
-            col_type = "Categorical/Text"
-            common_values = list(set(non_empty_values[:10]))[:3]
-            stats = f"Sample values: {common_values}"
-        missing_pct = round((missing_count / len(values)) * 100, 1) if values else 0
-        analysis.append(f"**{col}** ({col_type}): {unique_values} unique values, {missing_pct}% missing\n   └─ {stats}")
-    return "\n\n".join(analysis)
-def is_number(s):
-    """Check if string represents a number"""
-    try:
-        float(s)
-        return True
-    except (ValueError, TypeError):
-        return False
-def recommend_ml_models_simple(headers, data_rows):
-    """Simple ML model recommendations"""
-    recommendations = []
-    # Count estimated numerical and categorical columns
-    numerical_cols = []
-    categorical_cols = []
-    for i, col in enumerate(headers):
-        # Sample some values to determine type
-        sample_values = []
-        for row in data_rows[:50]:
-            row_data = [cell.strip().strip('"') for cell in row.split(',')]
-            if i < len(row_data) and row_data[i]:
-                sample_values.append(row_data[i])
-        if sample_values:
-            numeric_count = sum(1 for v in sample_values if is_number(v))
-            if numeric_count > len(sample_values) * 0.7:  # 70% numeric
-                numerical_cols.append(col)
             else:
-                categorical_cols.append(col)
-    total_rows = len(data_rows)
-    # Classification models recommendation
-    if categorical_cols:
-        recommendations.append("""
-**🎯 Classification Models** (Suitable for predicting categories)
-• **Logistic Regression** - Good for binary classification, interpretable results
-• **Random Forest** - Handles mixed data types well, provides feature importance
-• **Decision Tree** - Easy to interpret, good for creating business rules
-• **Support Vector Machine (SVM)** - Effective for high-dimensional data
-• **Gradient Boosting (XGBoost)** - High accuracy, handles missing values well
-• **Neural Networks** - For complex patterns (recommended if dataset > 1000 rows)
-*Best for: Predicting categories like Yes/No, High/Medium/Low, Customer segments*""")
-    # Regression models recommendation
-    if numerical_cols:
-        recommendations.append("""
-**📈 Regression Models** (Suitable for predicting continuous values)
-• **Linear Regression** - Simple, interpretable, good as baseline model
-• **Polynomial Regression** - For capturing non-linear relationships
-• **Random Forest Regressor** - Robust to outliers, handles mixed data types
-• **Ridge/Lasso Regression** - Good for high-dimensional data, prevents overfitting
-• **Gradient Boosting Regressor** - High accuracy for complex patterns
-• **Neural Networks** - For complex non-linear relationships
-*Best for: Predicting prices, quantities, scores, measurements, forecasts*""")
-    # Clustering models
-    if total_rows > 50:
-        recommendations.append("""
-**🔍 Clustering Models** (Suitable for finding hidden patterns)
-• **K-Means Clustering** - Good for customer segmentation, market analysis
-• **Hierarchical Clustering** - Creates tree-like cluster structure
-• **DBSCAN** - Finds clusters of varying shapes and sizes, handles noise
-*Best for: Customer segmentation, market analysis, pattern discovery*""")
-    # Time series recommendation
-    date_keywords = ['date', 'time', 'year', 'month', 'day', 'timestamp']
-    has_date_column = any(keyword in col.lower() for col in headers for keyword in date_keywords)
-    if has_date_column:
-        recommendations.append("""
-**⏰ Time Series Models** (Suitable for temporal data)
-• **ARIMA** - Classical time series forecasting
-• **Prophet** - Good for seasonal patterns and holidays
-• **LSTM Neural Networks** - For complex temporal patterns
-• **Exponential Smoothing** - Simple but effective for trends
-*Best for: Sales forecasting, demand prediction, trend analysis*""")
-    # Dataset size considerations
-    if total_rows < 100:
-        recommendations.append("""
-**⚠️ Dataset Size Consideration:**
-Your dataset is small (< 100 rows). Consider:
-• Simple models like Linear/Logistic Regression
-• Decision Trees with limited depth to avoid overfitting
-• Collecting more data for better model performance
-• Cross-validation for reliable performance estimates""")
-    elif total_rows > 10000:
-        recommendations.append("""
-**🚀 Large Dataset Advantages:**
-Your dataset is large (> 10,000 rows). You can use:
-• Complex models like Neural Networks and Deep Learning
-• Ensemble methods for higher accuracy
-• Advanced feature engineering techniques
-• Multiple model comparison and stacking""")
-    return "\n".join(recommendations) if recommendations else "Unable to determine suitable models. Please check your dataset format."
-def identify_variables_simple(headers, data_rows):
-    """Simple variable identification"""
-    dependent_candidates = []
-    independent_candidates = []
-    # Look for potential target variables
-    target_keywords = ['target', 'label', 'class', 'outcome', 'result', 'prediction', 'y']
-    for col in headers:
-        col_lower = col.lower()
-        # Check if column name suggests it's a target
-        if any(keyword in col_lower for keyword in target_keywords):
-            dependent_candidates.append(f"• **{col}** - Column name suggests this is a target variable")
-            continue
-        # Sample values to determine if categorical with few categories
-        sample_values = []
-        for row in data_rows[:100]:
-            row_data = [cell.strip().strip('"') for cell in row.split(',')]
-            col_idx = headers.index(col)
-            if col_idx < len(row_data) and row_data[col_idx]:
-                sample_values.append(row_data[col_idx])
-        unique_values = len(set(sample_values))
-        # Potential categorical target (few unique values)
-        if unique_values <= 10 and len(sample_values) > 0:
-            sample_unique = list(set(sample_values))[:5]
-            dependent_candidates.append(f"• **{col}** - Categorical with {unique_values} categories: {sample_unique}")
-        # Potential numerical target
-        elif is_number(sample_values[0]) if sample_values else False:
-            if any(keyword in col_lower for keyword in ['price', 'amount', 'score', 'rating', 'value']):
-                dependent_candidates.append(f"• **{col}** - Numerical variable suitable for regression")
-    # All other columns as independent variables
-    dep_var_names = [line.split('**')[1].split('**')[0] for line in dependent_candidates]
-    for col in headers:
-        if col not in dep_var_names:
-            # Determine type
-            sample_values = []
-            for row in data_rows[:50]:
-                row_data = [cell.strip().strip('"') for cell in row.split(',')]
-                col_idx = headers.index(col)
-                if col_idx < len(row_data) and row_data[col_idx]:
-                    sample_values.append(row_data[col_idx])
-            if sample_values:
-                is_numeric = all(is_number(v) for v in sample_values[:10])
-                col_type = "Numerical" if is_numeric else "Categorical"
-                unique_count = len(set(sample_values))
-                independent_candidates.append(f"• **{col}** ({col_type}) - {unique_count} unique values")
-    # Format output
-    dep_vars = "\n".join(dependent_candidates) if dependent_candidates else "• No clear target variables identified automatically.\n• Consider which variable you want to predict based on your business objective."
-    indep_vars = "\n".join(independent_candidates[:15]) if independent_candidates else "• All columns can potentially serve as features."
-    if len(independent_candidates) > 15:
-        indep_vars += f"\n• ... and {len(independent_candidates) - 15} more variables"
-    return dep_vars, indep_vars
-def recommend_visualizations_simple(headers, data_rows):
-    """Simple visualization recommendations"""
-    viz_recommendations = []
-    # Analyze column types
-    numerical_cols = []
-    categorical_cols = []
-    for col in headers:
-        # Sample values to determine type
-        sample_values = []
-        for row in data_rows[:50]:
-            row_data = [cell.strip().strip('"') for cell in row.split(',')]
-            col_idx = headers.index(col)
-            if col_idx < len(row_data) and row_data[col_idx]:
-                sample_values.append(row_data[col_idx])
-        if sample_values:
-            numeric_count = sum(1 for v in sample_values if is_number(v))
-            if numeric_count > len(sample_values) * 0.7:
-                numerical_cols.append(col)
-            else:
-                categorical_cols.append(col)
-    # Recommendations for numerical variables
-    if numerical_cols:
-        viz_recommendations.append("**📊 For Numerical Variables:**")
-        for col in numerical_cols[:5]:  # Limit to first 5
-            viz_recommendations.append(f"""
-• **{col}**:
-  - **Histogram** - Show distribution pattern and identify outliers
-  - **Box Plot** - Visualize quartiles, median, and outliers
-  - **Line Chart** - Show trends over time (if sequential data)
-  - **Scatter Plot** vs other numerical variables - Find correlations""")
-    # Recommendations for categorical variables
-    if categorical_cols:
-        viz_recommendations.append("\n**📈 For Categorical Variables:**")
-        for col in categorical_cols[:5]:  # Limit to first 5
-            # Count unique values
-            sample_values = []
-            for row in data_rows[:100]:
-                row_data = [cell.strip().strip('"') for cell in row.split(',')]
-                col_idx = headers.index(col)
-                if col_idx < len(row_data) and row_data[col_idx]:
-                    sample_values.append(row_data[col_idx])
-            unique_count = len(set(sample_values))
-            viz_recommendations.append(f"""
-• **{col}** ({unique_count} categories):
-  - **Bar Chart** - Compare frequency/count of each category
-  - **Pie Chart** - Show proportional breakdown (best if < 8 categories)
-  - **Donut Chart** - Modern alternative to pie chart
-  - **Horizontal Bar Chart** - Better for long category names""")
-    # Relationship visualizations
-    if len(numerical_cols) >= 2:
-        viz_recommendations.append(f"""
-**🔗 For Relationships Between Variables:**
-• **Correlation Heatmap** - Show relationships between all numerical variables
-• **Scatter Plot Matrix** - Compare pairs of numerical variables
-• **Pair Plot** - Detailed pairwise relationships with distributions""")
-    if numerical_cols and categorical_cols:
-        viz_recommendations.append(f"""
-• **Box Plot by Category** - Compare {numerical_cols[0]} distribution across {categorical_cols[0]} categories
-• **Violin Plot** - Show distribution shape across categories
-• **Bar Chart with Error Bars** - Show mean values with confidence intervals""")
-    # Advanced visualizations
-    if len(headers) >= 3:
-        viz_recommendations.append(f"""
-**🎯 Advanced Visualizations:**
-• **3D Scatter Plot** - Explore relationships between 3 variables
-• **Bubble Chart** - Show 3 dimensions using x, y, and bubble size
-• **Treemap** - For hierarchical categorical data
-• **Sunburst Chart** - For nested categorical relationships
-• **Parallel Coordinates** - For multivariate data exploration""")
-    # Dashboard recommendations
-    viz_recommendations.append(f"""
-**📋 Dashboard & Interactive Elements:**
-• **KPI Cards** - Show key metrics and summary statistics
-• **Filter Panels** - Allow users to slice and dice data
-• **Trend Lines** - Add to charts to highlight patterns
-• **Data Tables** - Show raw data with sorting and filtering
-• **Dropdown Selectors** - For choosing variables to visualize""")
-    return "\n".join(viz_recommendations)
-def assess_data_quality_simple(headers, data_rows):
-    """Simple data quality assessment"""
-    quality_issues = []
-    # Check for missing values
-    missing_analysis = []
-    for col in headers:
-        col_idx = headers.index(col)
-        missing_count = 0
-        total_count = 0
-        for row in data_rows:
-            row_data = [cell.strip().strip('"') for cell in row.split(',')]
-            total_count += 1
-            if col_idx >= len(row_data) or not row_data[col_idx] or row_data[col_idx] == '':
-                missing_count += 1
-        if missing_count > 0:
-            missing_pct = round((missing_count / total_count) * 100, 1)
-            missing_analysis.append(f"  - {col}: {missing_pct}% missing ({missing_count}/{total_count})")
-    if missing_analysis:
-        quality_issues.append("**Missing Values Detected:**")
-        quality_issues.extend(missing_analysis[:5])  # Show first 5
-        if len(missing_analysis) > 5:
-            quality_issues.append(f"  - ... and {len(missing_analysis) - 5} more columns with missing data")
-    # Check for potential duplicates (simple check)
-    unique_rows = set()
-    duplicate_count = 0
-    for row in data_rows[:1000]:  # Check first 1000 rows
-        if row in unique_rows:
-            duplicate_count += 1
-        else:
-            unique_rows.add(row)
-    if duplicate_count > 0:
-        quality_issues.append(f"**Potential Duplicate Rows:** {duplicate_count} duplicate rows detected in sample")
-    # Check for inconsistent data formats
-    format_issues = []
-    for col in headers[:5]:  # Check first 5 columns
-        col_idx = headers.index(col)
-        values = []
-        for row in data_rows[:100]:
-            row_data = [cell.strip().strip('"') for cell in row.split(',')]
-            if col_idx < len(row_data) and row_data[col_idx]:
-                values.append(row_data[col_idx])
-        if values:
-            # Check for mixed numeric/text in same column
-            numeric_count = sum(1 for v in values if is_number(v))
-            if 0 < numeric_count < len(values):
-                format_issues.append(f"  - {col}: Mixed data types (numeric and text)")
-    if format_issues:
-        quality_issues.append("**Data Format Issues:**")
-        quality_issues.extend(format_issues)
-    # Overall assessment
-    if not quality_issues:
-        return "✅ **Good Data Quality:** No major data quality issues detected in the sample."
-    else:
-        quality_issues.insert(0, "⚠️ **Data Quality Issues Found:**")
-        quality_issues.append("\n**Recommendations:**")
-        quality_issues.append("• Clean missing values before model training")
-        quality_issues.append("• Remove or handle duplicate records")
-        quality_issues.append("• Standardize data formats within columns")
-        quality_issues.append("• Validate data types and convert as needed")
-    return "\n".join(quality_issues)
-def generate_next_steps_simple():
-    """Generate recommended next steps"""
-    steps = [
-        "1. **Data Cleaning:** Handle missing values, duplicates, and outliers",
-        "2. **Data Exploration:** Create visualizations to understand patterns and relationships",
-        "3. **Feature Engineering:** Create new variables from existing ones if needed",
-        "4. **Variable Selection:** Choose the most relevant features for your model",
-        "5. **Model Selection:** Pick appropriate ML model based on recommendations above",
-        "6. **Data Splitting:** Divide data into training and testing sets (80/20 split)",
-        "7. **Model Training:** Train your selected model with the training data",
-        "8. **Model Evaluation:** Test model performance using appropriate metrics",
-        "9. **Model Tuning:** Optimize hyperparameters for better performance",
-        "10. **Model Deployment:** Implement the model for business use"
-    ]
-    return "\n".join(steps)
-def generate_ba_document(doc_type, user_input, dataset_file=None):
-    """Generate business analyst document based on type and input"""
-    # Handle dataset analysis
-    if doc_type == "data_analysis":
-        if dataset_file:
-            return analyze_dataset(dataset_file)
         else:
-            return "Please upload a dataset file (CSV format) to perform data analysis."
-    # Parse user input to extract key information
-    lines = user_input.strip().split('\n')
-    context = {}
-    # Extract context from user input
-    for line in lines:
-        if ':' in line:
-            key, value = line.split(':', 1)
-            context[key.strip().lower().replace(' ', '_')] = value.strip()
-    # Add default values
-    context['date'] = datetime.now().strftime("%Y-%m-%d")
-    context['analyst_name'] = context.get('analyst_name', 'Business Analyst')
-    if doc_type == "requirements":
-        return generate_requirements_doc(context, user_input)
-    elif doc_type == "user_story":
-        return generate_user_story(context, user_input)
-    elif doc_type == "process_flow":
-        return generate_process_flow(context, user_input)
-    elif doc_type == "gap_analysis":
-        return generate_gap_analysis(context, user_input)
-    elif doc_type == "stakeholder_analysis":
-        return generate_stakeholder_analysis(context, user_input)
-    else:
-        return "Please select a valid document type."
-def generate_requirements_doc(context, user_input):
-    """Generate Business Requirements Document"""
-    # Default values if not provided
-    project_name = context.get('project_name', context.get('project', 'New Project'))
-    # Generate content based on input
-    if 'objective' in user_input.lower() or 'goal' in user_input.lower():
-        objectives = extract_objectives(user_input)
-    else:
-        objectives = "• Improve business efficiency\n• Enhance user experience\n• Reduce operational costs"
-    if 'requirement' in user_input.lower():
-        functional_requirements = extract_requirements(user_input)
-    else:
-        functional_requirements = "• System shall allow users to [specific functionality]\n• System shall provide [specific feature]\n• System shall integrate with [existing systems]"
-    template = BA_TEMPLATES["requirements"]
-    return template.format(
-        project_name=project_name,
-        date=context['date'],
-        analyst_name=context['analyst_name'],
-        executive_summary=context.get('summary', 'This document outlines the business requirements for ' + project_name),
-        objectives=objectives,
-        functional_requirements=functional_requirements,
-        non_functional_requirements="• Performance: Response time < 2 seconds\n• Security: Role-based access control\n• Scalability: Support 1000+ concurrent users",
-        acceptance_criteria="• All functional requirements implemented\n• User acceptance testing completed\n• Performance benchmarks met",
-        assumptions="• Users have basic computer literacy\n• Integration APIs are available\n• Project timeline is 6 months",
-        risks="• Technical complexity may cause delays\n• User adoption challenges\n• Budget constraints"
-    )
-def generate_user_story(context, user_input):
-    """Generate User Story"""
-    # Extract user story components
-    user_type = context.get('user_type', context.get('as_a', 'end user'))
-    want = context.get('want', context.get('i_want', 'perform a specific action'))
-    benefit = context.get('benefit', context.get('so_that', 'achieve my goal efficiently'))
-    template = BA_TEMPLATES["user_story"]
-    return template.format(
-        user_type=user_type,
-        want=want,
-        benefit=benefit,
-        acceptance_criteria="• Given [precondition]\n• When [action]\n• Then [expected result]",
-        definition_of_done="• Code reviewed and approved\n• Unit tests written and passing\n• Documentation updated\n• Deployed to staging environment",
-        priority=context.get('priority', 'Medium'),
-        story_points=context.get('story_points', 'TBD')
-    )
-def generate_process_flow(context, user_input):
-    """Generate Process Flow Document"""
-    process_name = context.get('process_name', context.get('process', 'Business Process'))
-    template = BA_TEMPLATES["process_flow"]
-    return template.format(
-        process_name=process_name,
-        overview=context.get('overview', f'This document describes the {process_name} workflow and procedures.'),
-        steps="1. Process initiation\n2. Data collection\n3. Analysis and review\n4. Decision making\n5. Implementation\n6. Monitoring and feedback",
-        stakeholders="• Business Users\n• Process Owners\n• IT Support\n• Management",
-        systems="• CRM System\n• ERP System\n• Document Management\n• Reporting Tools",
-        kpis="• Process completion time\n• Error rate\n• Customer satisfaction\n• Cost per transaction",
-        improvements="• Automation opportunities\n• Bottleneck elimination\n• Quality enhancements\n• Cost reduction initiatives"
-    )
-def generate_gap_analysis(context, user_input):
-    """Generate Gap Analysis Report"""
-    template = BA_TEMPLATES["gap_analysis"]
-    return template.format(
-        current_state="Current business processes and systems analysis goes here...",
-        future_state="Desired future state and target operating model goes here...",
-        gaps="• Process inefficiencies\n• Technology limitations\n• Skill gaps\n• Resource constraints",
-        impact="• Operational impact assessment\n• Financial implications\n• Risk evaluation\n• Timeline considerations",
-        recommendations="• Short-term quick wins\n• Medium-term improvements\n• Long-term strategic initiatives\n• Resource allocation suggestions",
-        roadmap="Phase 1: Foundation (Months 1-3)\nPhase 2: Implementation (Months 4-8)\nPhase 3: Optimization (Months 9-12)"
-    )
-def generate_stakeholder_analysis(context, user_input):
-    """Generate Stakeholder Analysis"""
-    project_name = context.get('project_name', context.get('project', 'Current Project'))
-    template = BA_TEMPLATES["stakeholder_analysis"]
-    return template.format(
-        project_name=project_name,
-        high_high="• Executive Sponsor\n• Project Manager\n• Key Business Users",
-        high_low="• Senior Management\n• Department Heads\n• Regulatory Bodies",
-        low_high="• End Users\n• Customer Representatives\n• Support Teams",
-        low_low="• Vendors\n• External Consultants\n• Peripheral Teams",
-        communication_plan="• Weekly status reports\n• Monthly steering committee meetings\n• Quarterly business reviews",
-        engagement_strategy="• Regular one-on-one meetings\n• Focus groups and workshops\n• Change management activities"
-    )
-def extract_objectives(text):
-    """Extract objectives from user input"""
-    objectives = []
-    lines = text.split('\n')
-    for line in lines:
-        if any(keyword in line.lower() for keyword in ['objective', 'goal', 'aim', 'target']):
-            objectives.append(f"• {line.strip()}")
-    if not objectives:
-        return "• Improve business efficiency\n• Enhance user experience\n• Reduce operational costs"
-    return '\n'.join(objectives)
-def extract_requirements(text):
-    """Extract requirements from user input"""
-    requirements = []
-    lines = text.split('\n')
-    for line in lines:
-        if any(keyword in line.lower() for keyword in ['requirement', 'must', 'shall', 'should', 'need']):
-            requirements.append(f"• {line.strip()}")
-    if not requirements:
-        return "• System shall provide core functionality\n• System shall integrate with existing tools\n• System shall meet performance standards"
-    return '\n'.join(requirements)
-def ba_chat_response(message, history):
-    """Generate BA consultant response"""
-    ba_responses = {
-        "hello": "Hello! I'm your Business Analyst assistant. I can help you create requirements documents, user stories, process flows, gap analyses, stakeholder analyses, and analyze datasets for ML recommendations. What would you like to work on today?",
-        "help": "I can assist you with:\n• Business Requirements Documents\n• User Stories\n• Process Flow Documentation\n• Gap Analysis Reports\n• Stakeholder Analysis\n• Data Analysis with ML recommendations\n\nJust tell me what type of document you need and provide some details!",
-        "requirements": "To create a Business Requirements Document, please provide:\n• Project name\n• Business objectives\n• Key requirements\n• Stakeholders involved\n\nThen select 'Requirements Document' from the dropdown above.",
-        "user story": "For user stories, please provide:\n• User type (As a...)\n• What they want (I want...)\n• The benefit (So that...)\n• Priority level\n\nThen select 'User Story

 import gradio as gr
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.svm import SVC, SVR
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
+from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
+import matplotlib.pyplot as plt
+import seaborn as sns
+import io
+import base64
+import warnings
+warnings.filterwarnings('ignore')
+class BusinessAnalystGPT:
+    """Dataset analysis engine used by the Gradio callbacks.
+
+    The instance keeps the most recently loaded DataFrame in ``self.df`` and the
+    last generated report in ``self.analysis_results``. Report methods return
+    Markdown text; ``create_visualization`` returns PNG bytes or an error string.
+    """
+    # Largest number of distinct values a column may have and still be
+    # suggested as a classification target.
+    _MAX_CLASS_LABELS = 10
+    # Chart names understood by create_visualization().
+    _CHART_TYPES = ("Scatter Plot", "Line Chart", "Bar Chart", "Histogram", "Box Plot")
+    # (keywords, section text) pairs checked in order; the first match wins.
+    _TOPIC_SECTIONS = (
+        (
+            ('revenue', 'sales', 'profit', 'income'),
+            "\n### 💰 Revenue/Sales Insights:\n"
+            "- Focus on high-performing segments identified in the data\n"
+            "- Analyze seasonal trends if time data is available\n"
+            "- Consider customer segmentation based on purchase behavior\n",
+        ),
+        (
+            ('customer', 'client', 'user'),
+            "\n### 👥 Customer Insights:\n"
+            "- Segment customers based on key characteristics\n"
+            "- Identify high-value customer profiles\n"
+            "- Analyze customer retention patterns\n",
+        ),
+        (
+            ('marketing', 'campaign', 'advertising'),
+            "\n### 📢 Marketing Insights:\n"
+            "- Evaluate campaign performance metrics\n"
+            "- Identify most effective channels\n"
+            "- Optimize targeting based on demographic data\n",
+        ),
+        (
+            ('predict', 'forecast', 'future'),
+            "\n### 🔮 Predictive Insights:\n"
+            "- Use historical patterns for forecasting\n"
+            "- Apply machine learning models for predictions\n"
+            "- Consider external factors that might influence outcomes\n",
+        ),
+    )
+    # Section used when the question matches none of the topic keywords.
+    _GENERAL_SECTION = (
+        "\n### 🎯 General Business Recommendations:\n"
+        "- Identify key performance indicators from your data\n"
+        "- Look for correlations between important variables\n"
+        "- Consider segmentation strategies based on data patterns\n"
+    )
+    def __init__(self):
+        # No dataset is available until analyze_dataset() succeeds.
+        self.df = None
+        self.analysis_results = ""
+    def analyze_dataset(self, file):
+        """Load an uploaded CSV/Excel file and build the full Markdown report.
+
+        Args:
+            file: A path string, or an object whose ``name`` attribute is the
+                path of the uploaded file. ``None`` (nothing uploaded) is rejected.
+
+        Returns:
+            The Markdown report, or a string starting with ``Error`` when the
+            file is missing, has an unsupported extension, or cannot be parsed.
+        """
+        if file is None:
+            return "Error: Please upload a CSV or Excel file."
+        # Accept both file-like uploads (``.name``) and plain path strings.
+        path = str(getattr(file, "name", file))
+        # Compare extensions case-insensitively so "DATA.CSV" is accepted too.
+        lowered_path = path.lower()
+        try:
+            if lowered_path.endswith('.csv'):
+                self.df = pd.read_csv(path)
+            elif lowered_path.endswith(('.xlsx', '.xls')):
+                self.df = pd.read_excel(path)
+            else:
+                return "Error: Please upload a CSV or Excel file."
+            df = self.df
+            row_count = len(df)
+            # Basic dataset info
+            analysis = f"""
+# 📊 DATASET ANALYSIS REPORT
+## 📈 Basic Information
+- **Dataset Shape**: {df.shape[0]} rows × {df.shape[1]} columns
+- **Memory Usage**: {df.memory_usage(deep=True).sum() / 1024:.2f} KB
+- **Missing Values**: {df.isnull().sum().sum()} total
+## 📋 Column Information
 """
+            # Column details
+            for position, col in enumerate(df.columns, start=1):
+                series = df[col]
+                dtype = str(series.dtype)
+                missing = int(series.isnull().sum())
+                # A header-only file has zero rows; avoid dividing by it.
+                missing_pct = missing / row_count * 100 if row_count else 0.0
+                analysis += f"\n**{position}. {col}**\n"
+                analysis += f"   - Data Type: {dtype}\n"
+                analysis += f"   - Missing Values: {missing} ({missing_pct:.1f}%)\n"
+                analysis += f"   - Unique Values: {series.nunique()}\n"
+                # Any real-valued numeric dtype qualifies (int32, float32, ...),
+                # not only int64/float64; booleans and complex numbers do not.
+                is_numeric = (
+                    pd.api.types.is_numeric_dtype(series)
+                    and not pd.api.types.is_bool_dtype(series)
+                    and not pd.api.types.is_complex_dtype(series)
+                )
+                if is_numeric:
+                    # Statistics are meaningless for an all-missing column.
+                    if series.notna().any():
+                        analysis += f"   - Range: {series.min():.2f} to {series.max():.2f}\n"
+                        analysis += f"   - Mean: {series.mean():.2f}\n"
+                elif dtype == 'object':
+                    top_values = series.value_counts().head(3)
+                    analysis += f"   - Top Values: {list(top_values.index)}\n"
+            # Add ML Model Recommendations
+            analysis += self._get_ml_recommendations()
+            # Add Visualization Recommendations
+            analysis += self._get_visualization_recommendations()
+            self.analysis_results = analysis
+            return analysis
+        except Exception as e:
+            # UI boundary: report the failure as text instead of crashing the app.
+            return f"Error analyzing dataset: {str(e)}"
+    @staticmethod
+    def _count_iqr_outliers(series):
+        """Return how many values lie beyond 1.5 * IQR from the quartiles."""
+        q1 = series.quantile(0.25)
+        q3 = series.quantile(0.75)
+        iqr = q3 - q1
+        return ((series < (q1 - 1.5 * iqr)) | (series > (q3 + 1.5 * iqr))).sum()
+    def _get_ml_recommendations(self):
+        """Build the Markdown section suggesting targets, features and models.
+
+        Returns:
+            The Markdown section, or ``""`` when no dataset is loaded.
+        """
+        if self.df is None:
+            return ""
+        df = self.df
+        row_count = len(df)
+        ml_analysis = "\n\n## 🤖 MACHINE LEARNING MODEL RECOMMENDATIONS\n\n"
+        # Identify variable types
+        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
+        ml_analysis += "### 🎯 Potential Target Variables (Dependent Variables):\n"
+        # Suggest target variables based on data characteristics
+        target_suggestions = []
+        for col in numeric_cols:
+            distinct = df[col].nunique()
+            if distinct < 2:
+                # A constant or all-missing column leaves nothing to predict
+                # (this also covers the zero-row case before the division).
+                continue
+            unique_ratio = distinct / row_count
+            if unique_ratio < 0.1 and distinct <= self._MAX_CLASS_LABELS:
+                target_suggestions.append((col, "Classification", f"Has {distinct} unique values - good for classification"))
+            else:
+                # Everything that is not a small label set is treated as
+                # continuous, so no numeric column silently disappears.
+                target_suggestions.append((col, "Regression", "Continuous values - suitable for regression"))
+        for col in categorical_cols:
+            distinct = df[col].nunique()
+            if 2 <= distinct <= self._MAX_CLASS_LABELS:
+                target_suggestions.append((col, "Classification", f"Categorical with {distinct} classes"))
+        if target_suggestions:
+            for var, task_type, reason in target_suggestions:
+                ml_analysis += f"- **{var}** ({task_type}): {reason}\n"
         else:
+            ml_analysis += "- No clear target variables identified. Please specify based on your business objective.\n"
+        ml_analysis += "\n### 📊 Feature Variables (Independent Variables):\n"
+        # List potential feature variables
+        if numeric_cols:
+            ml_analysis += f"- **Numeric Features**: {', '.join(numeric_cols)}\n"
+        if categorical_cols:
+            ml_analysis += f"- **Categorical Features**: {', '.join(categorical_cols)}\n"
+        # Model recommendations based on data characteristics
+        ml_analysis += "\n### 🔮 Recommended Models & Expected Performance:\n\n"
+        task_types = {task_type for _, task_type, _ in target_suggestions}
+        # Classification models
+        if "Classification" in task_types:
+            ml_analysis += "#### 🎯 For Classification Tasks:\n"
+            ml_analysis += """
+1. **Random Forest Classifier** ⭐⭐⭐⭐⭐
+   - Expected Accuracy: 85-95%
+   - Best for: Mixed data types, feature importance
+   - Pros: Handles missing values, no overfitting
+2. **Logistic Regression** ⭐⭐⭐⭐
+   - Expected Accuracy: 75-85%
+   - Best for: Linear relationships, interpretability
+   - Pros: Fast, interpretable coefficients
+3. **Decision Tree** ⭐⭐⭐
+   - Expected Accuracy: 70-80%
+   - Best for: Rule-based decisions, interpretability
+   - Pros: Easy to understand and visualize
+4. **Support Vector Machine (SVM)** ⭐⭐⭐⭐
+   - Expected Accuracy: 80-90%
+   - Best for: High-dimensional data, small datasets
+   - Pros: Effective for complex patterns
+5. **K-Nearest Neighbors (KNN)** ⭐⭐⭐
+   - Expected Accuracy: 70-85%
+   - Best for: Simple patterns, small datasets
+   - Pros: Simple, no assumptions about data
+"""
+        # Regression models
+        if "Regression" in task_types:
+            ml_analysis += "\n#### 📈 For Regression Tasks:\n"
+            ml_analysis += """
+1. **Random Forest Regressor** ⭐⭐⭐⭐⭐
+   - Expected R² Score: 0.80-0.95
+   - Best for: Non-linear relationships, feature importance
+   - Pros: Robust, handles outliers well
+2. **Linear Regression** ⭐⭐⭐⭐
+   - Expected R² Score: 0.70-0.85
+   - Best for: Linear relationships, interpretability
+   - Pros: Fast, interpretable, baseline model
+3. **Support Vector Regression (SVR)** ⭐⭐⭐⭐
+   - Expected R² Score: 0.75-0.90
+   - Best for: Non-linear patterns, robust predictions
+   - Pros: Effective for complex relationships
+4. **Decision Tree Regressor** ⭐⭐⭐
+   - Expected R² Score: 0.65-0.80
+   - Best for: Non-linear, interpretable rules
+   - Pros: Easy to understand decision path
+"""
+        # Data preprocessing recommendations
+        ml_analysis += "\n### 🛠️ Data Preprocessing Recommendations:\n"
+        missing_data = df.isnull().sum().sum()
+        if missing_data > 0:
+            ml_analysis += f"- **Handle Missing Data**: {missing_data} missing values need attention\n"
+        if categorical_cols:
+            ml_analysis += "- **Encode Categorical Variables**: Use Label Encoding or One-Hot Encoding\n"
+        if len(numeric_cols) > 1:
+            ml_analysis += "- **Feature Scaling**: Consider StandardScaler for SVM/KNN models\n"
+        # Flag the dataset as soon as one column has more than 5% outliers.
+        outlier_budget = row_count * 0.05
+        outliers_detected = any(
+            self._count_iqr_outliers(df[col]) > outlier_budget for col in numeric_cols
+        )
+        if outliers_detected:
+            ml_analysis += "- **Handle Outliers**: Detected outliers that may affect model performance\n"
+        return ml_analysis
+    def _get_visualization_recommendations(self):
+        """Build the Markdown section listing suggested charts per column and pair.
+
+        Returns:
+            The Markdown section, or ``""`` when no dataset is loaded.
+        """
+        if self.df is None:
+            return ""
+        df = self.df
+        viz_analysis = "\n\n## 📊 DATA VISUALIZATION RECOMMENDATIONS\n\n"
+        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
+        # Single variable visualizations
+        viz_analysis += "### 📈 Single Variable Analysis:\n\n"
+        for col in numeric_cols:
+            viz_analysis += f"**{col}** (Numeric):\n"
+            viz_analysis += f"- **Histogram**: Show distribution of {col}\n"
+            viz_analysis += f"- **Box Plot**: Identify outliers in {col}\n"
+            viz_analysis += f"- **Density Plot**: Smooth distribution curve for {col}\n\n"
+        for col in categorical_cols:
+            unique_count = df[col].nunique()
+            viz_analysis += f"**{col}** (Categorical - {unique_count} categories):\n"
+            if unique_count <= 10:
+                viz_analysis += f"- **Bar Chart**: Count of each category in {col}\n"
+                viz_analysis += f"- **Pie Chart**: Proportion of categories in {col}\n"
             else:
+                viz_analysis += f"- **Bar Chart**: Top 10 categories in {col}\n"
+            viz_analysis += f"- **Donut Chart**: Alternative to pie chart for {col}\n\n"
+        # Two variable relationships
+        if len(df.columns) > 1:
+            viz_analysis += "### 🔗 Two Variable Relationships:\n\n"
+            # Numeric vs Numeric: every unordered pair, in column order.
+            if len(numeric_cols) >= 2:
+                viz_analysis += "**Numeric vs Numeric Combinations:**\n"
+                for idx, col1 in enumerate(numeric_cols):
+                    for col2 in numeric_cols[idx + 1:]:
+                        viz_analysis += f"- **Scatter Plot**: {col1} (X-axis) vs {col2} (Y-axis)\n"
+                        viz_analysis += f"- **Correlation Heatmap**: Relationship strength between {col1} and {col2}\n"
+                viz_analysis += "\n"
+            # Categorical vs Numeric
+            if categorical_cols and numeric_cols:
+                viz_analysis += "**Categorical vs Numeric Combinations:**\n"
+                for cat_col in categorical_cols:
+                    for num_col in numeric_cols:
+                        viz_analysis += f"- **Box Plot**: {cat_col} (X-axis) vs {num_col} (Y-axis)\n"
+                        viz_analysis += f"- **Violin Plot**: Distribution of {num_col} across {cat_col} categories\n"
+                        viz_analysis += f"- **Bar Plot**: Average {num_col} by {cat_col}\n"
+                viz_analysis += "\n"
+            # Categorical vs Categorical: every unordered pair, in column order.
+            if len(categorical_cols) >= 2:
+                viz_analysis += "**Categorical vs Categorical Combinations:**\n"
+                for idx, col1 in enumerate(categorical_cols):
+                    for col2 in categorical_cols[idx + 1:]:
+                        viz_analysis += f"- **Stacked Bar Chart**: {col1} (X-axis) stacked by {col2}\n"
+                        viz_analysis += f"- **Heatmap**: Cross-tabulation of {col1} vs {col2}\n"
+                        viz_analysis += f"- **Grouped Bar Chart**: {col1} grouped by {col2}\n"
+                viz_analysis += "\n"
+        # Advanced visualizations
+        if len(df.columns) >= 3:
+            viz_analysis += "### 🎨 Advanced Multi-Variable Analysis:\n\n"
+            if len(numeric_cols) >= 3:
+                viz_analysis += "**For 3+ Numeric Variables:**\n"
+                viz_analysis += f"- **3D Scatter Plot**: {numeric_cols[0]} (X) vs {numeric_cols[1]} (Y) vs {numeric_cols[2]} (Z)\n"
+                viz_analysis += "- **Pair Plot**: All numeric variables against each other\n"
+                viz_analysis += "- **Correlation Matrix**: Heatmap of all numeric correlations\n\n"
+            if len(numeric_cols) >= 2 and len(categorical_cols) >= 1:
+                viz_analysis += "**For Mixed Variable Types:**\n"
+                viz_analysis += f"- **Scatter Plot with Color**: {numeric_cols[0]} vs {numeric_cols[1]} colored by {categorical_cols[0]}\n"
+                viz_analysis += f"- **Bubble Chart**: {numeric_cols[0]} (X) vs {numeric_cols[1]} (Y) with bubble size from another variable\n\n"
+        # Dashboard recommendations
+        viz_analysis += "### 📋 Dashboard Layout Suggestions:\n\n"
+        viz_analysis += "**Top Row**: Overview metrics and key KPIs\n"
+        viz_analysis += "**Middle Section**: Main analysis charts (2-3 key visualizations)\n"
+        viz_analysis += "**Bottom Section**: Detailed breakdowns and filters\n"
+        viz_analysis += "**Side Panel**: Interactive filters and controls\n"
+        return viz_analysis
+    def generate_business_insights(self, question):
+        """Summarise the loaded dataset and append advice keyed to the question.
+
+        Args:
+            question: Free-text business question; ``None`` is treated as empty.
+                Keyword groups are checked in a fixed order and only the first
+                matching group contributes its advice section.
+
+        Returns:
+            Markdown text, or a prompt to upload data when no dataset is loaded.
+        """
+        if self.df is None:
+            return "Please upload a dataset first to generate insights."
+        df = self.df
+        # The UI may pass None when the textbox was never touched.
+        question = question or ""
+        insights = f"""
+# 💡 BUSINESS INSIGHTS & RECOMMENDATIONS
+## Question: {question}
+## 📊 Data-Driven Analysis:
+"""
+        # Basic statistics
+        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
+        if numeric_cols:
+            insights += "\n### 📈 Key Metrics:\n"
+            for col in numeric_cols[:5]:  # Show top 5 numeric columns
+                mean_val = df[col].mean()
+                median_val = df[col].median()
+                std_val = df[col].std()
+                insights += f"- **{col}**: Mean = {mean_val:.2f}, Median = {median_val:.2f}, Std = {std_val:.2f}\n"
+        if categorical_cols:
+            insights += "\n### 📋 Category Distribution:\n"
+            total_count = len(df)
+            for col in categorical_cols[:3]:  # Show top 3 categorical columns
+                modes = df[col].mode()
+                if modes.empty:
+                    # An all-missing column has no mode; indexing it would raise.
+                    insights += f"- **{col}**: no non-missing values\n"
+                    continue
+                top_category = modes.iloc[0]
+                category_count = df[col].value_counts().iloc[0]
+                percentage = (category_count / total_count) * 100
+                insights += f"- **{col}**: Most common = '{top_category}' ({category_count}/{total_count} = {percentage:.1f}%)\n"
+        # Generate recommendations based on question keywords
+        question_lower = question.lower()
+        for keywords, section in self._TOPIC_SECTIONS:
+            if any(word in question_lower for word in keywords):
+                insights += section
+                break
+        else:
+            insights += self._GENERAL_SECTION
+        # Add data quality assessment
+        total_cells = df.shape[0] * df.shape[1]
+        # An empty frame has no cells; report it as fully complete rather than NaN.
+        missing_data_pct = (df.isnull().sum().sum() / total_cells) * 100 if total_cells else 0.0
+        insights += "\n### ⚠️ Data Quality Notes:\n"
+        insights += f"- Missing data: {missing_data_pct:.1f}% of total data points\n"
+        insights += f"- Data completeness: {100-missing_data_pct:.1f}%\n"
+        if missing_data_pct > 10:
+            insights += "- **Recommendation**: Address missing data before making critical decisions\n"
+        return insights
+    def create_visualization(self, chart_type, x_column, y_column):
+        """Render the requested chart of the loaded dataset as a PNG image.
+
+        Args:
+            chart_type: One of the names in ``_CHART_TYPES``.
+            x_column: Column plotted on the X axis (the only column used by
+                histograms, categorical bar charts and ungrouped box plots).
+            y_column: Column plotted on the Y axis. Required for scatter, line
+                and numeric bar charts; for box plots an optional text column
+                to group by.
+
+        Returns:
+            PNG data as ``bytes`` on success, otherwise a ``str`` describing
+            the problem.
+        """
+        if self.df is None:
+            return "Please upload a dataset first."
+        df = self.df
+        if chart_type not in self._CHART_TYPES:
+            return f"Error creating visualization: unsupported chart type '{chart_type}'."
+        if x_column is None or x_column not in df.columns:
+            return "Error creating visualization: please select a valid X-axis column."
+        has_y = y_column is not None and y_column in df.columns
+        x_is_text = df[x_column].dtype == 'object'
+        needs_y = chart_type in ("Scatter Plot", "Line Chart") or (
+            chart_type == "Bar Chart" and not x_is_text
+        )
+        if needs_y and not has_y:
+            return f"Error creating visualization: {chart_type} needs a valid Y-axis column."
+        plt.style.use('default')
+        # Work on an explicit figure instead of pyplot's implicit "current
+        # figure", so the finally block always closes exactly what was opened.
+        fig, ax = plt.subplots(figsize=(10, 6))
+        try:
+            if chart_type == "Scatter Plot":
+                ax.scatter(df[x_column], df[y_column], alpha=0.6)
+                ax.set_xlabel(x_column)
+                ax.set_ylabel(y_column)
+                ax.set_title(f'Scatter Plot: {x_column} vs {y_column}')
+            elif chart_type == "Line Chart":
+                ax.plot(df[x_column], df[y_column])
+                ax.set_xlabel(x_column)
+                ax.set_ylabel(y_column)
+                ax.set_title(f'Line Chart: {x_column} vs {y_column}')
+            elif chart_type == "Bar Chart":
+                if x_is_text:
+                    value_counts = df[x_column].value_counts().head(10)
+                    # Labels are stringified because object columns may mix types.
+                    ax.bar(value_counts.index.astype(str), value_counts.values)
+                    ax.set_xlabel(x_column)
+                    ax.set_ylabel('Count')
+                    ax.set_title(f'Bar Chart: {x_column}')
+                    ax.tick_params(axis='x', rotation=45)
+                else:
+                    ax.bar(df[x_column], df[y_column])
+                    ax.set_xlabel(x_column)
+                    ax.set_ylabel(y_column)
+                    ax.set_title(f'Bar Chart: {x_column} vs {y_column}')
+            elif chart_type == "Histogram":
+                # Missing values make the automatic bin range non-finite.
+                ax.hist(df[x_column].dropna(), bins=30, alpha=0.7)
+                ax.set_xlabel(x_column)
+                ax.set_ylabel('Frequency')
+                ax.set_title(f'Histogram: {x_column}')
+            elif chart_type == "Box Plot":
+                if has_y and df[y_column].dtype == 'object':
+                    # Passing ax keeps pandas from opening a second figure.
+                    df.boxplot(column=x_column, by=y_column, ax=ax)
+                    # Drop the super-title pandas adds; the axes title replaces it.
+                    fig.suptitle("")
+                    ax.set_title(f'Box Plot: {x_column} by {y_column}')
+                else:
+                    ax.boxplot(df[x_column].dropna())
+                    ax.set_ylabel(x_column)
+                    ax.set_title(f'Box Plot: {x_column}')
+            fig.tight_layout()
+            # Save plot to bytes
+            img_buffer = io.BytesIO()
+            fig.savefig(img_buffer, format='png', dpi=150, bbox_inches='tight')
+            return img_buffer.getvalue()
+        except Exception as e:
+            # UI boundary: report plotting failures as text.
+            return f"Error creating visualization: {str(e)}"
+        finally:
+            # Release the figure on success and on failure alike.
+            plt.close(fig)
+# Initialize the Business Analyst GPT.
+# A single module-level instance: every callback defined below reads and
+# writes this object, so the loaded dataset lives in one place.
+# NOTE(review): presumably this state is shared by all concurrent users of the
+# running app - confirm whether per-session state is needed.
+analyst = BusinessAnalystGPT()
+# Gradio callbacks: thin adapters between UI events and the shared analyzer.
+def analyze_file(file):
+    """Run the dataset analysis for the uploaded file and return the report text."""
+    report = analyst.analyze_dataset(file)
+    return report
+def generate_insights(question):
+    """Return the business-insights text the shared analyzer produces for *question*."""
+    insights_text = analyst.generate_business_insights(question)
+    return insights_text
+def create_chart(chart_type, x_col, y_col):
+    """Render a chart and return it in a form an image component can display.
+
+    Args:
+        chart_type: Chart name forwarded to the analyzer.
+        x_col: Column selected for the X axis.
+        y_col: Column selected for the Y axis (may be unset for some charts).
+
+    Returns:
+        The rendered chart as a uint8 pixel array.
+
+    Raises:
+        gr.Error: When the analyzer reports a problem instead of image data,
+            so the message is shown to the user rather than being mistaken
+            for an image.
+    """
+    result = analyst.create_visualization(chart_type, x_col, y_col)
+    if not isinstance(result, bytes):
+        # Anything that is not PNG data is an explanatory message.
+        raise gr.Error(str(result))
+    # Raw PNG bytes are not a value the image output accepts; decode them
+    # into a pixel array first.
+    pixels = plt.imread(io.BytesIO(result), format="png")
+    if pixels.dtype != np.uint8:
+        # PNG data is decoded as floats in [0, 1]; rescale to 8-bit channels.
+        pixels = (pixels * 255).round().astype(np.uint8)
+    return pixels
+def get_columns():
+    """Return dropdown updates listing the loaded dataset's columns for both axes.
+
+    Both updates carry an empty choice list while no dataset is loaded.
+    """
+    column_names = [] if analyst.df is None else list(analyst.df.columns)
+    x_update = gr.update(choices=column_names)
+    y_update = gr.update(choices=column_names)
+    return x_update, y_update
+# Create the Gradio interface
+# The layout is declarative: components appear in the order they are created
+# inside each Tab/Row context, so statement order here is the page layout.
+with gr.Blocks(title="Business Analyst GPT", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🤖 Business Analyst GPT
+    ### Your AI-Powered Data Analysis Assistant
+    Upload your dataset and get comprehensive business insights, ML model recommendations, and visualization suggestions!
+    """)
+    # Tab 1: upload a file and show the report returned by analyze_file.
+    with gr.Tab("📊 Dataset Analysis"):
+        with gr.Row():
+            file_input = gr.File(label="Upload your dataset (CSV or Excel)", file_types=[".csv", ".xlsx", ".xls"])
+            analyze_btn = gr.Button("🔍 Analyze Dataset", variant="primary")
+        analysis_output = gr.Markdown(label="Analysis Results")
+        # The uploaded file goes to analyze_file; its text fills the Markdown pane.
+        analyze_btn.click(analyze_file, inputs=[file_input], outputs=[analysis_output])
+    # Tab 2: free-text question answered by generate_insights.
+    with gr.Tab("💡 Business Insights"):
+        with gr.Row():
+            question_input = gr.Textbox(
+                label="Ask a business question about your data",
+                placeholder="e.g., How can I increase revenue? What are the key customer segments?",
+                lines=2
+            )
+            insights_btn = gr.Button("💡 Generate Insights", variant="primary")
+        insights_output = gr.Markdown(label="Business Insights")
+        insights_btn.click(generate_insights, inputs=[question_input], outputs=[insights_output])
+    # Tab 3: pick a chart type and columns, then render through create_chart.
+    with gr.Tab("📈 Data Visualization"):
+        with gr.Row():
+            chart_type = gr.Dropdown(
+                choices=["Scatter Plot", "Line Chart", "Bar Chart", "Histogram", "Box Plot"],
+                label="Chart Type",
+                value="Scatter Plot"
+            )
+            refresh_cols = gr.Button("🔄 Refresh Columns")
+        with gr.Row():
+            # Both dropdowns start empty; the refresh button fills them.
+            x_column = gr.Dropdown(choices=[], label="X-axis Column")
+            y_column = gr.Dropdown(choices=[], label="Y-axis Column (optional for some charts)")
+        create_viz_btn = gr.Button("📊 Create Visualization", variant="primary")
+        viz_output = gr.Image(label="Visualization")
+        # get_columns takes no inputs and returns one update per dropdown.
+        refresh_cols.click(get_columns, outputs=[x_column, y_column])
+        create_viz_btn.click(create_chart, inputs=[chart_type, x_column, y_column], outputs=[viz_output])
+    # Tab 4: static usage instructions.
+    with gr.Tab("ℹ️ How to Use"):
+        gr.Markdown("""
+        ## 🚀 How to Use Business Analyst GPT
+        ### Step 1: Upload Your Dataset
+        - Click on "Dataset Analysis" tab
+        - Upload a CSV or Excel file containing your business data
+        - Click "Analyze Dataset" to get comprehensive insights
+        ### Step 2: Get ML Model Recommendations
+        After uploading, you'll receive:
+        - **Target Variable Suggestions**: Which columns can be predicted
+        - **Feature Variable Identification**: Which columns to use as predictors
+        - **Model Recommendations**: Best ML algorithms for your data
+        - **Expected Performance**: Accuracy estimates for each model
+        ### Step 3: Get Specific Visualization Ideas
+        The analysis will provide:
+        - **Single Variable Charts**: Best charts for each column
+        - **Two Variable Relationships**: Specific X-axis and Y-axis recommendations
+        - **Advanced Visualizations**: Multi-variable analysis suggestions
+        - **Dashboard Layout**: How to organize your charts
+        ### Step 4: Generate Business Insights
+        - Ask specific business questions about your data
+        - Get data-driven recommendations and insights
+        - Receive actionable business strategies
+        ### Step 5: Create Visualizations
+        - Choose from various chart types
+        - Select specific columns for X and Y axes
+        - Generate publication-ready charts
+        ## 📋 Supported File Types
+        - CSV files (.csv)
+        - Excel files (.xlsx, .xls)
+        ## 🎯 Best Practices
+        1. **Clean Data**: Ensure your dataset has clear column headers
+        2. **Relevant Questions**: Ask specific business questions for better insights
+        3. **Column Selection**: Choose appropriate columns for visualizations
+        4. **Data Size**: Larger datasets provide more reliable ML recommendations
+        """)
+# Launch the app
+# Guarded so that importing this module does not start the server.
+if __name__ == "__main__":
+    demo.launch()