import gradio as gr
import pandas as pd
import numpy as np
from datetime import datetime
import warnings

warnings.filterwarnings('ignore')


def _read_dataset(path):
    """Read a CSV or Excel file into a DataFrame based on its extension."""
    if path.endswith('.csv'):
        return pd.read_csv(path)
    return pd.read_excel(path)


def _profile_dataset(df):
    """Return a dict of basic structural facts about *df* (shape, dtypes, missing data)."""
    return {
        'shape': df.shape,
        'numeric_cols': list(df.select_dtypes(include=[np.number]).columns),
        'categorical_cols': list(df.select_dtypes(include=['object', 'category']).columns),
        'datetime_cols': list(df.select_dtypes(include=['datetime64']).columns),
        'missing_data': df.isnull().sum().to_dict(),
        'duplicates': df.duplicated().sum(),
        'memory_mb': round(df.memory_usage(deep=True).sum() / 1024**2, 2),
    }


def _score_targets(df, profile):
    """Heuristically score every column's likelihood of being an ML target.

    Scoring signals: target-like keywords in the name (+15), numeric columns
    with a "predictable" unique-value ratio (+10) and non-zero variance (+5),
    categorical columns with 2-20 classes (+12, +5 binary bonus), and position
    near the end of the column list (+3) — targets conventionally come last.
    Returns {column_name: score}.
    """
    target_keywords = ['target', 'label', 'class', 'outcome', 'price', 'sales',
                       'churn', 'rating', 'status']
    target_scores = {}
    for col in df.columns:
        score = 0
        col_lower = col.lower()
        # Keyword matching
        score += 15 if any(kw in col_lower for kw in target_keywords) else 0
        # Statistical characteristics
        if col in profile['numeric_cols']:
            unique_ratio = df[col].nunique() / len(df)
            if 0.02 < unique_ratio < 0.95:  # Good target range
                score += 10
            if df[col].std() > 0:  # Has variance
                score += 5
        elif col in profile['categorical_cols']:
            unique_count = df[col].nunique()
            if 2 <= unique_count <= 20:  # Good classification range
                score += 12
                score += 5 if unique_count == 2 else 0  # Binary bonus
        # Position bias (targets often at end)
        if list(df.columns).index(col) >= len(df.columns) - 3:
            score += 3
        target_scores[col] = score
    return target_scores


def _build_ml_recommendations(df, profile, smart_targets):
    """Build model-selection recommendations for the top candidate targets."""
    ml_recommendations = []
    for target in smart_targets[:2]:  # Top 2 targets
        target_type = 'numeric' if target in profile['numeric_cols'] else 'categorical'
        unique_vals = df[target].nunique()
        # Smart model selection based on actual data
        if target_type == 'numeric':
            skewness = df[target].skew()
            if abs(skewness) > 2:
                models = ['XGBoost Regressor', 'Random Forest', 'Gradient Boosting']
                reason = f"Highly skewed data (skew: {skewness:.2f}) - tree models handle non-linearity"
            else:
                models = ['Linear Regression', 'Ridge/Lasso', 'Random Forest']
                reason = f"Normal distribution - linear models suitable"
        else:
            if unique_vals == 2:
                models = ['Logistic Regression', 'XGBoost Classifier', 'Random Forest']
                reason = f"Binary classification - balanced approach recommended"
            elif unique_vals <= 5:
                models = ['Multinomial Logistic', 'XGBoost', 'Random Forest']
                reason = f"{unique_vals} classes - multi-class classification"
            else:
                models = ['XGBoost', 'Neural Networks', 'Random Forest']
                reason = f"{unique_vals} classes - complex multi-class problem"
        ml_recommendations.append({
            'target': target,
            'type': target_type,
            'unique_vals': unique_vals,
            'models': models,
            'reason': reason,
            'features': [c for c in df.columns if c != target],
        })
    return ml_recommendations


def _build_viz_recommendations(df, profile):
    """Build a list of suggested visualizations with inline summary statistics."""
    viz_recs = []
    # Executive KPIs: last value plus a crude trend sign from correlation with row order.
    for col in profile['numeric_cols'][:4]:
        trend = '↗️' if df[col].corr(pd.Series(range(len(df)))) > 0 else '↘️'
        viz_recs.append(
            f"📊 KPI Card: {col} (Current: {df[col].iloc[-1]:.2f}, Trend: {trend})")
    # Comparative analysis: category-wise averages for low-cardinality categoricals.
    for cat in profile['categorical_cols'][:2]:
        for num in profile['numeric_cols'][:2]:
            if df[cat].nunique() <= 10:
                avg_by_cat = df.groupby(cat)[num].mean().round(2).to_dict()
                viz_recs.append(f"📊 Bar Chart: {num} by {cat} - Averages: {avg_by_cat}")
    # Correlation insights: strongest absolute correlations between distinct columns.
    if len(profile['numeric_cols']) >= 2:
        corr_matrix = df[profile['numeric_cols']].corr()
        corr_pairs = corr_matrix.abs().unstack()
        # FIX: unstack() yields every pair twice ((a,b) and (b,a)); keep each
        # pair once via var1 < var2, which also drops self-correlations.  This
        # replaces the old `< 1.0` filter that silently hid perfectly
        # correlated distinct columns AND listed duplicate pairs in head(3).
        corr_pairs = corr_pairs[[a < b for a, b in corr_pairs.index]]
        strongest_corr = corr_pairs.sort_values(ascending=False).head(3)
        for (var1, var2), corr_val in strongest_corr.items():
            viz_recs.append(
                f"🔗 Scatter Plot: {var1} vs {var2} (Correlation: {corr_val:.3f})")
    # Distribution insights: IQR-based outlier counts and skew per numeric column.
    for col in profile['numeric_cols'][:3]:
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        outliers = len(df[(df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)])
        viz_recs.append(
            f"📈 Histogram: {col} distribution (Outliers: {outliers}, Skew: {df[col].skew():.2f})")
    return viz_recs


def smart_dataset_analysis(file):
    """Complete dataset analysis with intelligent recommendations.

    Parameters
    ----------
    file : gradio file object or None
        Uploaded CSV/Excel file (``file.name`` is the temp path on disk).

    Returns
    -------
    str
        A markdown analysis report, or an error/help message on failure.
    """
    if not file:
        return "Upload a dataset to get intelligent analysis."
    try:
        df = _read_dataset(file.name)
        if df.empty:
            # Guard: downstream per-row statistics (iloc[-1], ratios) need rows.
            return "Analysis failed: the uploaded dataset contains no rows."

        profile = _profile_dataset(df)

        # Smart target detection with scoring; keep the 3 best candidates and
        # recommend those clearing the confidence threshold.
        target_scores = _score_targets(df, profile)
        top_targets = sorted(target_scores.items(), key=lambda x: x[1], reverse=True)[:3]
        smart_targets = [col for col, score in top_targets if score > 8]

        ml_recommendations = _build_ml_recommendations(df, profile, smart_targets)
        viz_recs = _build_viz_recommendations(df, profile)

        # FIX: quality score must be over total cells (rows * cols), not rows —
        # the old formula capped a perfectly clean dataset at 100/n_cols %.
        total_cells = len(df) * len(df.columns)
        quality_score = (total_cells - sum(profile['missing_data'].values())) / total_cells * 100

        report = f"""# 🧠 INTELLIGENT DATASET ANALYSIS

## 📊 Dataset Intelligence
**Rows:** {profile['shape'][0]:,} | **Columns:** {profile['shape'][1]} | **Size:** {profile['memory_mb']} MB | **Duplicates:** {profile['duplicates']}
**Data Quality Score:** {quality_score:.1f}%

## 🎯 SMART TARGET DETECTION
"""
        for i, (target, score) in enumerate(top_targets[:3], 1):
            target_info = f"**{i}. {target}** (Confidence: {score}/20)"
            if target in smart_targets:
                target_info += f" ✅ **RECOMMENDED**"
            if target in profile['numeric_cols']:
                target_info += (f" | Values: {df[target].min():.2f} - {df[target].max():.2f}"
                                f" | Mean: {df[target].mean():.2f}")
            else:
                top_values = df[target].value_counts().head(3).to_dict()
                target_info += f" | Top categories: {top_values}"
            report += f"{target_info}\n"

        report += f"\n## 🤖 ML MODEL RECOMMENDATIONS\n"
        for i, rec in enumerate(ml_recommendations, 1):
            report += f"""### Scenario {i}: Predict `{rec['target']}`
**Problem Type:** {rec['type'].upper()} {'REGRESSION' if rec['type'] == 'numeric' else 'CLASSIFICATION'}
**AI Reasoning:** {rec['reason']}
**Recommended Models:**
1. 🥇 **{rec['models'][0]}** (Primary choice)
2. 🥈 **{rec['models'][1]}** (Alternative)
3. 🥉 **{rec['models'][2]}** (Backup option)
**Features Available:** {len(rec['features'])} variables
**Data Split:** {int(len(df)*0.8):,} train / {int(len(df)*0.2):,} test

"""

        # Data preprocessing recommendations
        missing_cols = [col for col, missing in profile['missing_data'].items() if missing > 0]
        high_cardinality = [col for col in profile['categorical_cols'] if df[col].nunique() > 20]

        report += f"""## ⚙️ PREPROCESSING PIPELINE
**Missing Data:** {len(missing_cols)} columns need attention
"""
        if missing_cols:
            for col in missing_cols[:5]:
                missing_pct = (profile['missing_data'][col] / len(df)) * 100
                # >50% missing → drop; otherwise impute by dtype (median / mode).
                strategy = ("Drop column" if missing_pct > 50
                            else ("Median fill" if col in profile['numeric_cols'] else "Mode fill"))
                report += f"- `{col}`: {missing_pct:.1f}% missing → {strategy}\n"

        if high_cardinality:
            report += f"**High Cardinality:** {len(high_cardinality)} categorical columns need encoding\n"
            for col in high_cardinality[:3]:
                report += f"- `{col}`: {df[col].nunique()} categories → Target encoding recommended\n"

        report += f"\n## 📊 SMART VISUALIZATIONS\n"
        for viz in viz_recs:
            report += f"{viz}\n"

        # Performance predictions
        sample_size_category = ("Large" if len(df) > 50000
                                else ("Medium" if len(df) > 5000 else "Small"))

        report += f"""
## 🚀 IMPLEMENTATION ROADMAP
### Phase 1: Data Preparation (Week 1)
- Handle {len(missing_cols)} missing data issues
- Encode {len(profile['categorical_cols'])} categorical variables
- Feature scaling for {len(profile['numeric_cols'])} numeric variables

### Phase 2: Model Development (Week 2-3)
- {sample_size_category} dataset → Expected training time: {'Minutes' if len(df) < 10000 else 'Hours'}
- Feature importance analysis using top recommended models
- Cross-validation with {5 if len(df) > 1000 else 3}-fold strategy

### Phase 3: Production (Week 4)
- Model deployment pipeline
- Performance monitoring dashboard
- A/B testing framework

## 💡 BUSINESS IMPACT PREDICTION
**Model Accuracy Expectation:** {85 if len(df) > 10000 and len(missing_cols) < 3 else 75}%+
**ROI Timeline:** 3-6 months
**Automation Potential:** {min(90, len(profile['numeric_cols']) * 10)}% of manual analysis
"""
        return report

    except Exception as e:
        return f"Analysis failed: {str(e)}\nEnsure file is valid CSV/Excel format."


def smart_ba_chat(message, history):
    """Intelligent BA assistant with contextual responses.

    Keyword-matches *message* into one of four templates (requirements,
    process, visualization, or a generic analysis) and fills in heuristic
    values derived from the message text.  *history* is accepted for
    interface compatibility with the chat handler but is not used here.

    Returns the markdown response string.
    """
    # Smart response generation based on keywords
    keywords = message.lower()

    if any(word in keywords for word in ['requirement', 'functional', 'specification']):
        response = f"""## 📋 Smart Requirements Analysis for: "{message}"

### 🎯 Identified Requirements Type
**Primary:** {'Functional' if 'function' in keywords else 'Business'} Requirements
**Complexity:** {'High' if len(message.split()) > 20 else 'Medium'}

### 📝 Generated Requirements Framework
1. **Must Have (P1)**
   - Core functionality: {message.split('.')[0] if '.' in message else message[:50]}
   - User authentication and authorization
   - Data validation and error handling

2. **Should Have (P2)**
   - Reporting and analytics dashboard
   - Export/import capabilities
   - Audit trail functionality

3. **Could Have (P3)**
   - Advanced filtering options
   - Mobile responsiveness
   - Integration APIs

### ✅ Acceptance Criteria Template
```
Given: User has appropriate permissions
When: User performs {message.split()[-1] if message.split() else 'action'}
Then: System should respond within 3 seconds
And: Changes should be logged for audit
```

### 🔍 Next Steps
- [ ] Stakeholder validation workshop
- [ ] Technical feasibility assessment
- [ ] Resource estimation and timeline
"""

    elif any(word in keywords for word in ['process', 'workflow', 'procedure']):
        response = f"""## 🔄 Process Analysis for: "{message}"

### 📊 Current State Assessment
**Process Complexity:** {'High' if 'complex' in keywords or 'multiple' in keywords else 'Medium'}
**Stakeholders Involved:** {'Multiple departments' if 'department' in keywords else 'Single team'}

### 🎯 Identified Pain Points
- Manual handoffs and delays
- Lack of visibility and tracking
- Inconsistent execution
- No performance metrics

### 🚀 Recommended Solution
**Automation Level:** {80 if 'automate' in keywords else 60}%
**Expected Efficiency Gain:** {40 if 'improve' in keywords else 25}%
**Implementation Timeline:** {'3-6 months' if 'large' in keywords else '6-12 weeks'}

### 📈 Process Optimization Steps
1. **Map Current State** (Week 1-2)
2. **Identify Bottlenecks** (Week 3)
3. **Design Future State** (Week 4-5)
4. **Pilot Implementation** (Week 6-8)
5. **Full Rollout** (Week 9-12)

### 🎯 Success Metrics
- Process cycle time reduction: 50%+
- Error rate reduction: 80%+
- User satisfaction score: 4.5+/5
"""

    elif any(word in keywords for word in ['dashboard', 'report', 'visual', 'chart']):
        response = f"""## 📊 Visualization Strategy for: "{message}"

### 🎨 Smart Chart Recommendations
**Data Type Detected:** {'Time-series' if 'time' in keywords or 'trend' in keywords else 'Categorical'}
**Audience Level:** {'Executive' if 'executive' in keywords else 'Operational'}

### 📈 Recommended Visualizations
1. **KPI Dashboard**
   - Primary metrics with trend indicators
   - Color-coded status (Red/Yellow/Green)
   - Real-time data refresh

2. **Comparative Analysis**
   - Bar charts for category comparison
   - Heat maps for correlation analysis
   - Scatter plots for relationship insights

3. **Trend Analysis**
   - Line charts for time-based data
   - Area charts for cumulative metrics
   - Waterfall charts for variance analysis

### 🎯 Dashboard Layout Strategy
```
┌─────────────────────────────────────────┐
│ Executive Summary KPIs │
├─────────────────┬───────────────────────┤
│ Primary Chart │ Filter Controls │
│ (60% width) │ & Drill-downs │
├─────────────────┴───────────────────────┤
│ Supporting Analytics │
└─────────────────────────────────────────┘
```

### 💡 Business Intelligence Features
- Interactive filtering and drill-down
- Automated insights and anomaly detection
- Mobile-responsive design
- Export capabilities (PDF, Excel, PowerPoint)
"""

    else:
        # Generic intelligent response
        response = f"""## 🧠 Smart Business Analysis for: "{message}"

### 🎯 Analysis Summary
**Domain:** {'Technology' if any(tech in keywords for tech in ['system', 'software', 'digital', 'ai', 'ml']) else 'Business Operations'}
**Complexity:** {'High' if len(message.split()) > 15 else 'Medium'}
**Urgency:** {'High' if any(urgent in keywords for urgent in ['urgent', 'asap', 'critical']) else 'Normal'}

### 📊 Key Considerations
1. **Stakeholder Impact Analysis**
   - Primary users: Business operations team
   - Secondary users: Management and IT
   - Decision makers: Department heads

2. **Success Criteria Definition**
   - Quantitative: ROI > 20%, Time savings > 30%
   - Qualitative: User satisfaction, Process efficiency
   - Timeline: {'6-8 weeks' if 'quick' in keywords else '3-4 months'}

3. **Risk Assessment**
   - Technical: {'Medium' if 'technical' in keywords else 'Low'}
   - Business: {'High' if 'change' in keywords else 'Medium'}
   - Resource: Based on scope and timeline

### 🚀 Recommended Action Plan
**Phase 1:** Requirement gathering and stakeholder alignment
**Phase 2:** Solution design and prototype development
**Phase 3:** Implementation and testing
**Phase 4:** Deployment and change management

### 💡 Next Steps
- Schedule stakeholder interviews
- Define detailed acceptance criteria
- Create project timeline and milestones
- Identify potential risks and mitigation strategies
"""

    return response


# Streamlined Gradio Interface
with gr.Blocks(title="Smart Business Analyst GPT", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Smart Business Analyst GPT\n## AI-Powered Dataset Analysis & Business Intelligence")

    with gr.Tabs():
        with gr.TabItem("💬 Intelligent BA Assistant"):
            chatbot = gr.Chatbot(height=500, label="Smart BA Assistant")
            msg = gr.Textbox(placeholder="Describe your business challenge...",
                             label="Your Question", lines=2)
            with gr.Row():
                submit = gr.Button("🚀 Get Smart Analysis", variant="primary")
                clear = gr.Button("Clear")

        with gr.TabItem("📊 Complete Dataset Analysis"):
            gr.Markdown("### Upload your dataset for comprehensive AI analysis")
            file_upload = gr.File(label="Upload CSV/Excel", file_types=[".csv", ".xlsx", ".xls"])
            analyze_btn = gr.Button("🧠 Complete Analysis", variant="primary", size="lg")
            analysis_output = gr.Textbox(label="Intelligent Analysis Report",
                                         lines=30, show_copy_button=True)

    # Event handlers
    def chat_respond(message, history):
        """Append the assistant's reply to the chat history and clear the input box."""
        response = smart_ba_chat(message, history)
        history.append((message, response))
        return "", history

    msg.submit(chat_respond, [msg, chatbot], [msg, chatbot])
    submit.click(chat_respond, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: [], None, chatbot)
    analyze_btn.click(smart_dataset_analysis, file_upload, analysis_output)

if __name__ == "__main__":
    demo.launch()