"""Business Analyst GPT: a Gradio app that profiles an uploaded dataset.

The app reads a CSV/Excel file into a pandas DataFrame and produces a
Markdown report (column summary, ML-model suggestions, chart suggestions),
keyword-driven business recommendations, and simple matplotlib charts.
"""
import base64
import io
import itertools
import warnings

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

warnings.filterwarnings('ignore')

# Charts are rendered to an in-memory PNG from Gradio callbacks, never shown
# in a window, so use the non-interactive backend (GUI backends generally
# cannot be driven from server worker threads).
plt.switch_backend("Agg")

# Static advice shown when a classification target is suggested. The ratings
# and "expected" scores are fixed text; nothing is trained or measured.
_CLASSIFICATION_MODELS_MD = """
1. **Random Forest Classifier** ⭐⭐⭐⭐⭐
   - Expected Accuracy: 85-95%
   - Best for: Mixed data types, feature importance
   - Pros: Handles missing values, no overfitting

2. **Logistic Regression** ⭐⭐⭐⭐
   - Expected Accuracy: 75-85%
   - Best for: Linear relationships, interpretability
   - Pros: Fast, interpretable coefficients

3. **Decision Tree** ⭐⭐⭐
   - Expected Accuracy: 70-80%
   - Best for: Rule-based decisions, interpretability
   - Pros: Easy to understand and visualize

4. **Support Vector Machine (SVM)** ⭐⭐⭐⭐
   - Expected Accuracy: 80-90%
   - Best for: High-dimensional data, small datasets
   - Pros: Effective for complex patterns

5. **K-Nearest Neighbors (KNN)** ⭐⭐⭐
   - Expected Accuracy: 70-85%
   - Best for: Simple patterns, small datasets
   - Pros: Simple, no assumptions about data
"""

# Static advice shown when a regression target is suggested (fixed text).
_REGRESSION_MODELS_MD = """
1. **Random Forest Regressor** ⭐⭐⭐⭐⭐
   - Expected R² Score: 0.80-0.95
   - Best for: Non-linear relationships, feature importance
   - Pros: Robust, handles outliers well

2. **Linear Regression** ⭐⭐⭐⭐
   - Expected R² Score: 0.70-0.85
   - Best for: Linear relationships, interpretability
   - Pros: Fast, interpretable, baseline model

3. **Support Vector Regression (SVR)** ⭐⭐⭐⭐
   - Expected R² Score: 0.75-0.90
   - Best for: Non-linear patterns, robust predictions
   - Pros: Effective for complex relationships

4. **Decision Tree Regressor** ⭐⭐⭐
   - Expected R² Score: 0.65-0.80
   - Best for: Non-linear, interpretable rules
   - Pros: Easy to understand decision path
"""

# (keywords, heading, bullet lines), checked in order; the first topic with
# a keyword occurring anywhere in the lower-cased question wins.
_INSIGHT_TOPICS = (
    (
        ('revenue', 'sales', 'profit', 'income'),
        "\n### 💰 Revenue/Sales Insights:\n",
        (
            "- Focus on high-performing segments identified in the data\n",
            "- Analyze seasonal trends if time data is available\n",
            "- Consider customer segmentation based on purchase behavior\n",
        ),
    ),
    (
        ('customer', 'client', 'user'),
        "\n### 👥 Customer Insights:\n",
        (
            "- Segment customers based on key characteristics\n",
            "- Identify high-value customer profiles\n",
            "- Analyze customer retention patterns\n",
        ),
    ),
    (
        ('marketing', 'campaign', 'advertising'),
        "\n### 📢 Marketing Insights:\n",
        (
            "- Evaluate campaign performance metrics\n",
            "- Identify most effective channels\n",
            "- Optimize targeting based on demographic data\n",
        ),
    ),
    (
        ('predict', 'forecast', 'future'),
        "\n### 🔮 Predictive Insights:\n",
        (
            "- Use historical patterns for forecasting\n",
            "- Apply machine learning models for predictions\n",
            "- Consider external factors that might influence outcomes\n",
        ),
    ),
)

# Fallback section used when no topic keyword matches the question.
_GENERAL_INSIGHTS = (
    "\n### 🎯 General Business Recommendations:\n",
    "- Identify key performance indicators from your data\n",
    "- Look for correlations between important variables\n",
    "- Consider segmentation strategies based on data patterns\n",
)

_UPLOAD_ERROR = "Error: Please upload a CSV or Excel file."

_APP_HEADER_MD = """
# 🤖 Business Analyst GPT
### Your AI-Powered Data Analysis Assistant

Upload your dataset and get comprehensive business insights, ML model recommendations, and visualization suggestions!
"""

_HOW_TO_USE_MD = """
## 🚀 How to Use Business Analyst GPT

### Step 1: Upload Your Dataset
- Click on "Dataset Analysis" tab
- Upload a CSV or Excel file containing your business data
- Click "Analyze Dataset" to get comprehensive insights

### Step 2: Get ML Model Recommendations
After uploading, you'll receive:
- **Target Variable Suggestions**: Which columns can be predicted
- **Feature Variable Identification**: Which columns to use as predictors
- **Model Recommendations**: Best ML algorithms for your data
- **Expected Performance**: Accuracy estimates for each model

### Step 3: Get Specific Visualization Ideas
The analysis will provide:
- **Single Variable Charts**: Best charts for each column
- **Two Variable Relationships**: Specific X-axis and Y-axis recommendations
- **Advanced Visualizations**: Multi-variable analysis suggestions
- **Dashboard Layout**: How to organize your charts

### Step 4: Generate Business Insights
- Ask specific business questions about your data
- Get data-driven recommendations and insights
- Receive actionable business strategies

### Step 5: Create Visualizations
- Choose from various chart types
- Select specific columns for X and Y axes
- Generate publication-ready charts

## 📋 Supported File Types
- CSV files (.csv)
- Excel files (.xlsx, .xls)

## 🎯 Best Practices
1. **Clean Data**: Ensure your dataset has clear column headers
2. **Relevant Questions**: Ask specific business questions for better insights
3. **Column Selection**: Choose appropriate columns for visualizations
4. **Data Size**: Larger datasets provide more reliable ML recommendations
"""


def _is_plain_numeric(series):
    """Return True for integer/unsigned/float NumPy columns (not bool/object)."""
    dtype = series.dtype
    return isinstance(dtype, np.dtype) and dtype.kind in "iuf"


def _has_heavy_outliers(series, n_rows):
    """Return True when more than 5% of rows fall outside the 1.5*IQR fences."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    outside = (series < (q1 - 1.5 * iqr)) | (series > (q3 + 1.5 * iqr))
    return outside.sum() > n_rows * 0.05


class BusinessAnalystGPT:
    """Hold the currently loaded DataFrame and build reports/charts from it.

    Attributes:
        df: The last successfully loaded dataset, or None before any upload.
        analysis_results: The Markdown report from the last successful
            ``analyze_dataset`` call ("" until then).
    """

    def __init__(self):
        self.df = None
        self.analysis_results = ""

    def analyze_dataset(self, file):
        """Load an uploaded CSV/Excel file and return a Markdown report.

        Args:
            file: The upload handed over by ``gr.File``: an object exposing
                the path as ``.name``, or a plain path string. ``None``
                (nothing uploaded) yields an error message.

        Returns:
            The Markdown report, or a string starting with "Error" when the
            file is missing, has an unsupported extension, or fails to load.
        """
        if file is None:
            return _UPLOAD_ERROR
        try:
            path = str(getattr(file, "name", file))
            # Compare case-insensitively so "DATA.CSV" is accepted too.
            lowered = path.lower()
            if lowered.endswith('.csv'):
                self.df = pd.read_csv(path)
            elif lowered.endswith(('.xlsx', '.xls')):
                self.df = pd.read_excel(path)
            else:
                return _UPLOAD_ERROR

            df = self.df
            total_rows = len(df)
            parts = [
                "\n# 📊 DATASET ANALYSIS REPORT\n",
                "\n## 📈 Basic Information\n",
                f"- **Dataset Shape**: {df.shape[0]} rows × {df.shape[1]} columns\n",
                f"- **Memory Usage**: {df.memory_usage(deep=True).sum() / 1024:.2f} KB\n",
                f"- **Missing Values**: {df.isnull().sum().sum()} total\n",
                "\n## 📋 Column Information\n",
            ]

            for position, col in enumerate(df.columns, start=1):
                series = df[col]
                dtype = str(series.dtype)
                missing = int(series.isnull().sum())
                # Guard the percentage for header-only files (zero rows).
                missing_pct = missing / total_rows * 100 if total_rows else 0.0

                parts.append(f"\n**{position}. {col}**\n")
                parts.append(f" - Data Type: {dtype}\n")
                parts.append(f" - Missing Values: {missing} ({missing_pct:.1f}%)\n")
                parts.append(f" - Unique Values: {series.nunique()}\n")

                if _is_plain_numeric(series):
                    parts.append(f" - Range: {series.min():.2f} to {series.max():.2f}\n")
                    parts.append(f" - Mean: {series.mean():.2f}\n")
                elif dtype == 'object':
                    top_values = series.value_counts().head(3)
                    parts.append(f" - Top Values: {list(top_values.index)}\n")

            parts.append(self._get_ml_recommendations())
            parts.append(self._get_visualization_recommendations())

            analysis = "".join(parts)
            self.analysis_results = analysis
            return analysis
        except Exception as e:
            # UI boundary: report the failure as text instead of crashing.
            return f"Error analyzing dataset: {str(e)}"

    def _get_ml_recommendations(self):
        """Return the Markdown "ML model recommendations" section.

        Suggests target columns with simple cardinality heuristics, lists
        numeric/categorical feature columns, appends the static model advice
        matching the suggested task types, and adds preprocessing hints.
        Returns "" when no dataset is loaded.
        """
        if self.df is None:
            return ""

        df = self.df
        n_rows = len(df)
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

        parts = [
            "\n\n## 🤖 MACHINE LEARNING MODEL RECOMMENDATIONS\n\n",
            "### 🎯 Potential Target Variables (Dependent Variables):\n",
        ]

        # (column, task type, reason) triples; nothing to suggest without rows.
        target_suggestions = []
        if n_rows:
            for col in numeric_cols:
                n_unique = df[col].nunique()
                unique_ratio = n_unique / n_rows
                if unique_ratio < 0.1 and n_unique <= 10:
                    target_suggestions.append((
                        col,
                        "Classification",
                        f"Has {n_unique} unique values - good for classification",
                    ))
                elif unique_ratio > 0.1:
                    target_suggestions.append(
                        (col, "Regression", "Continuous values - suitable for regression")
                    )
            for col in categorical_cols:
                n_unique = df[col].nunique()
                if n_unique <= 10:
                    target_suggestions.append(
                        (col, "Classification", f"Categorical with {n_unique} classes")
                    )

        if target_suggestions:
            parts.extend(
                f"- **{var}** ({task_type}): {reason}\n"
                for var, task_type, reason in target_suggestions
            )
        else:
            parts.append(
                "- No clear target variables identified. "
                "Please specify based on your business objective.\n"
            )

        parts.append("\n### 📊 Feature Variables (Independent Variables):\n")
        # Column labels are not guaranteed to be strings; stringify for join.
        if numeric_cols:
            parts.append(f"- **Numeric Features**: {', '.join(map(str, numeric_cols))}\n")
        if categorical_cols:
            parts.append(
                f"- **Categorical Features**: {', '.join(map(str, categorical_cols))}\n"
            )

        parts.append("\n### 🔮 Recommended Models & Expected Performance:\n\n")
        task_types = {task_type for _, task_type, _ in target_suggestions}
        if "Classification" in task_types:
            parts.append("#### 🎯 For Classification Tasks:\n")
            parts.append(_CLASSIFICATION_MODELS_MD)
        if "Regression" in task_types:
            parts.append("\n#### 📈 For Regression Tasks:\n")
            parts.append(_REGRESSION_MODELS_MD)

        parts.append("\n### 🛠️ Data Preprocessing Recommendations:\n")
        missing_data = df.isnull().sum().sum()
        if missing_data > 0:
            parts.append(
                f"- **Handle Missing Data**: {missing_data} missing values need attention\n"
            )
        if categorical_cols:
            parts.append(
                "- **Encode Categorical Variables**: Use Label Encoding or One-Hot Encoding\n"
            )
        if len(numeric_cols) > 1:
            parts.append("- **Feature Scaling**: Consider StandardScaler for SVM/KNN models\n")
        # any() stops at the first column that crosses the 5% threshold.
        if any(_has_heavy_outliers(df[col], n_rows) for col in numeric_cols):
            parts.append(
                "- **Handle Outliers**: Detected outliers that may affect model performance\n"
            )

        return "".join(parts)

    def _get_visualization_recommendations(self):
        """Return the Markdown "visualization recommendations" section.

        Lists chart ideas per column, per column pair (numeric/numeric,
        categorical/numeric, categorical/categorical) and for multi-variable
        views, followed by fixed dashboard-layout advice. Returns "" when no
        dataset is loaded. Output grows quadratically with the column count
        because every pair is listed.
        """
        if self.df is None:
            return ""

        df = self.df
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

        parts = [
            "\n\n## 📊 DATA VISUALIZATION RECOMMENDATIONS\n\n",
            "### 📈 Single Variable Analysis:\n\n",
        ]

        for col in numeric_cols:
            parts.append(f"**{col}** (Numeric):\n")
            parts.append(f"- **Histogram**: Show distribution of {col}\n")
            parts.append(f"- **Box Plot**: Identify outliers in {col}\n")
            parts.append(f"- **Density Plot**: Smooth distribution curve for {col}\n\n")

        for col in categorical_cols:
            unique_count = df[col].nunique()
            parts.append(f"**{col}** (Categorical - {unique_count} categories):\n")
            if unique_count <= 10:
                parts.append(f"- **Bar Chart**: Count of each category in {col}\n")
                parts.append(f"- **Pie Chart**: Proportion of categories in {col}\n")
            else:
                parts.append(f"- **Bar Chart**: Top 10 categories in {col}\n")
            # NOTE(review): the original indentation was lost; the donut line
            # is emitted for every categorical column so each entry ends with
            # a blank line — confirm this matches the intended nesting.
            parts.append(f"- **Donut Chart**: Alternative to pie chart for {col}\n\n")

        if len(df.columns) > 1:
            parts.append("### 🔗 Two Variable Relationships:\n\n")

            if len(numeric_cols) >= 2:
                parts.append("**Numeric vs Numeric Combinations:**\n")
                for col1, col2 in itertools.combinations(numeric_cols, 2):
                    parts.append(f"- **Scatter Plot**: {col1} (X-axis) vs {col2} (Y-axis)\n")
                    parts.append(
                        f"- **Correlation Heatmap**: Relationship strength "
                        f"between {col1} and {col2}\n"
                    )
                parts.append("\n")

            if categorical_cols and numeric_cols:
                parts.append("**Categorical vs Numeric Combinations:**\n")
                for cat_col, num_col in itertools.product(categorical_cols, numeric_cols):
                    parts.append(f"- **Box Plot**: {cat_col} (X-axis) vs {num_col} (Y-axis)\n")
                    parts.append(
                        f"- **Violin Plot**: Distribution of {num_col} "
                        f"across {cat_col} categories\n"
                    )
                    parts.append(f"- **Bar Plot**: Average {num_col} by {cat_col}\n")
                parts.append("\n")

            if len(categorical_cols) >= 2:
                parts.append("**Categorical vs Categorical Combinations:**\n")
                for col1, col2 in itertools.combinations(categorical_cols, 2):
                    parts.append(f"- **Stacked Bar Chart**: {col1} (X-axis) stacked by {col2}\n")
                    parts.append(f"- **Heatmap**: Cross-tabulation of {col1} vs {col2}\n")
                    parts.append(f"- **Grouped Bar Chart**: {col1} grouped by {col2}\n")
                parts.append("\n")

        if len(df.columns) >= 3:
            parts.append("### 🎨 Advanced Multi-Variable Analysis:\n\n")

            if len(numeric_cols) >= 3:
                parts.append("**For 3+ Numeric Variables:**\n")
                parts.append(
                    f"- **3D Scatter Plot**: {numeric_cols[0]} (X) vs "
                    f"{numeric_cols[1]} (Y) vs {numeric_cols[2]} (Z)\n"
                )
                parts.append("- **Pair Plot**: All numeric variables against each other\n")
                parts.append("- **Correlation Matrix**: Heatmap of all numeric correlations\n\n")

            if len(numeric_cols) >= 2 and len(categorical_cols) >= 1:
                parts.append("**For Mixed Variable Types:**\n")
                parts.append(
                    f"- **Scatter Plot with Color**: {numeric_cols[0]} vs "
                    f"{numeric_cols[1]} colored by {categorical_cols[0]}\n"
                )
                parts.append(
                    f"- **Bubble Chart**: {numeric_cols[0]} (X) vs {numeric_cols[1]} (Y) "
                    f"with bubble size from another variable\n\n"
                )

        parts.append("### 📋 Dashboard Layout Suggestions:\n\n")
        parts.append("**Top Row**: Overview metrics and key KPIs\n")
        parts.append("**Middle Section**: Main analysis charts (2-3 key visualizations)\n")
        parts.append("**Bottom Section**: Detailed breakdowns and filters\n")
        parts.append("**Side Panel**: Interactive filters and controls\n")

        return "".join(parts)

    def generate_business_insights(self, question):
        """Return Markdown insights for ``question`` about the loaded dataset.

        The report contains mean/median/std for up to five numeric columns,
        the most common value for up to three text columns, a block of fixed
        recommendations chosen by keywords found in the question, and a
        missing-data summary.

        Args:
            question: Free-text business question; ``None`` is treated as an
                empty question for keyword matching.

        Returns:
            The Markdown report, or a prompt to upload data when no dataset
            is loaded.
        """
        if self.df is None:
            return "Please upload a dataset first to generate insights."

        df = self.df
        parts = [
            "\n# 💡 BUSINESS INSIGHTS & RECOMMENDATIONS\n",
            f"\n## Question: {question}\n",
            "\n## 📊 Data-Driven Analysis:\n",
        ]

        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

        if numeric_cols:
            parts.append("\n### 📈 Key Metrics:\n")
            for col in numeric_cols[:5]:  # first 5 numeric columns only
                series = df[col]
                parts.append(
                    f"- **{col}**: Mean = {series.mean():.2f}, "
                    f"Median = {series.median():.2f}, Std = {series.std():.2f}\n"
                )

        if categorical_cols:
            parts.append("\n### 📋 Category Distribution:\n")
            total_count = len(df)
            for col in categorical_cols[:3]:  # first 3 text columns only
                counts = df[col].value_counts()
                if counts.empty:
                    # All-null (or zero-row) column: there is no "most common".
                    parts.append(f"- **{col}**: No non-null values\n")
                    continue
                top_category = counts.index[0]
                category_count = counts.iloc[0]
                percentage = (category_count / total_count) * 100
                parts.append(
                    f"- **{col}**: Most common = '{top_category}' "
                    f"({category_count}/{total_count} = {percentage:.1f}%)\n"
                )

        question_lower = (question or "").lower()
        for keywords, heading, bullets in _INSIGHT_TOPICS:
            if any(word in question_lower for word in keywords):
                parts.append(heading)
                parts.extend(bullets)
                break
        else:
            parts.extend(_GENERAL_INSIGHTS)

        total_cells = df.shape[0] * df.shape[1]
        # An empty table has no cells; report 0% missing instead of NaN.
        missing_data_pct = (
            df.isnull().sum().sum() / total_cells * 100 if total_cells else 0.0
        )
        parts.append("\n### ⚠️ Data Quality Notes:\n")
        parts.append(f"- Missing data: {missing_data_pct:.1f}% of total data points\n")
        parts.append(f"- Data completeness: {100 - missing_data_pct:.1f}%\n")
        if missing_data_pct > 10:
            parts.append(
                "- **Recommendation**: Address missing data before making critical decisions\n"
            )

        return "".join(parts)

    def create_visualization(self, chart_type, x_column, y_column):
        """Render the requested chart and return it as PNG bytes.

        Args:
            chart_type: One of "Scatter Plot", "Line Chart", "Bar Chart",
                "Histogram", "Box Plot".
            x_column: Column for the X axis (the plotted column for
                histogram/box plot, the counted column for text bar charts).
            y_column: Column for the Y axis. Required for scatter/line charts
                and for bar charts over a non-text X column; for box plots a
                text column here groups the boxes; otherwise ignored.

        Returns:
            PNG image data as ``bytes`` on success, otherwise a ``str``
            describing why no chart was produced.
        """
        if self.df is None:
            return "Please upload a dataset first."
        df = self.df

        if not x_column:
            return "Error creating visualization: please select an X-axis column."
        # Dropdown values can be stale if a new dataset was loaded since the
        # last column refresh.
        unknown = [c for c in (x_column, y_column) if c and c not in df.columns]
        if unknown:
            return (
                "Error creating visualization: unknown column(s): "
                + ", ".join(map(str, unknown))
            )

        x_is_text = df[x_column].dtype == 'object'
        needs_y = chart_type in ("Scatter Plot", "Line Chart") or (
            chart_type == "Bar Chart" and not x_is_text
        )
        if needs_y and not y_column:
            return f"Error creating visualization: {chart_type} requires a Y-axis column."

        fig = None
        try:
            plt.style.use('default')
            fig, ax = plt.subplots(figsize=(10, 6))

            if chart_type == "Scatter Plot":
                ax.scatter(df[x_column], df[y_column], alpha=0.6)
                ax.set_xlabel(x_column)
                ax.set_ylabel(y_column)
                ax.set_title(f'Scatter Plot: {x_column} vs {y_column}')
            elif chart_type == "Line Chart":
                ax.plot(df[x_column], df[y_column])
                ax.set_xlabel(x_column)
                ax.set_ylabel(y_column)
                ax.set_title(f'Line Chart: {x_column} vs {y_column}')
            elif chart_type == "Bar Chart":
                if x_is_text:
                    # Text column: plot the counts of its 10 most frequent values.
                    value_counts = df[x_column].value_counts().head(10)
                    ax.bar(value_counts.index.astype(str), value_counts.values)
                    ax.set_xlabel(x_column)
                    ax.set_ylabel('Count')
                    ax.set_title(f'Bar Chart: {x_column}')
                    ax.tick_params(axis='x', rotation=45)
                else:
                    ax.bar(df[x_column], df[y_column])
                    ax.set_xlabel(x_column)
                    ax.set_ylabel(y_column)
                    ax.set_title(f'Bar Chart: {x_column} vs {y_column}')
            elif chart_type == "Histogram":
                ax.hist(df[x_column].dropna(), bins=30, alpha=0.7)
                ax.set_xlabel(x_column)
                ax.set_ylabel('Frequency')
                ax.set_title(f'Histogram: {x_column}')
            elif chart_type == "Box Plot":
                if y_column and df[y_column].dtype == 'object':
                    # Draw on our axes; without ``ax`` pandas opens a second
                    # figure and the one created above is never closed.
                    df.boxplot(column=x_column, by=y_column, ax=ax)
                    ax.set_title(f'Box Plot: {x_column} by {y_column}')
                else:
                    ax.boxplot(df[x_column].dropna())
                    ax.set_ylabel(x_column)
                    ax.set_title(f'Box Plot: {x_column}')
            else:
                return f"Error creating visualization: unsupported chart type '{chart_type}'"

            fig.tight_layout()
            img_buffer = io.BytesIO()
            fig.savefig(img_buffer, format='png', dpi=150, bbox_inches='tight')
            return img_buffer.getvalue()
        except Exception as e:
            # UI boundary: report the failure as text instead of crashing.
            return f"Error creating visualization: {str(e)}"
        finally:
            # Always release the figure, including on errors and early returns.
            if fig is not None:
                plt.close(fig)


# Single module-level analyst: every callback (and therefore every browser
# session served by this process) reads and writes the same loaded dataset.
analyst = BusinessAnalystGPT()


def analyze_file(file):
    """Gradio callback: analyze the uploaded file and return the report."""
    return analyst.analyze_dataset(file)


def generate_insights(question):
    """Gradio callback: return Markdown insights for the user's question."""
    return analyst.generate_business_insights(question)


def create_chart(chart_type, x_col, y_col):
    """Gradio callback: return the chart as an image array for ``gr.Image``.

    ``create_visualization`` yields PNG bytes on success and a message string
    otherwise. ``gr.Image`` accepts neither raw bytes nor free text, so the
    PNG is decoded into a uint8 pixel array and messages are raised as
    ``gr.Error`` so they are shown to the user.
    """
    result = analyst.create_visualization(chart_type, x_col, y_col)
    if not isinstance(result, bytes):
        raise gr.Error(str(result))
    pixels = plt.imread(io.BytesIO(result), format='png')
    if pixels.dtype.kind == 'f':
        # imread returns PNG data as floats in [0, 1]; convert to 8-bit.
        pixels = np.round(pixels * 255).astype(np.uint8)
    return pixels


def get_columns():
    """Gradio callback: refresh both column dropdowns from the loaded data."""
    columns = list(analyst.df.columns) if analyst.df is not None else []
    return gr.update(choices=columns), gr.update(choices=list(columns))


# Build the Gradio interface.
with gr.Blocks(title="Business Analyst GPT", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_APP_HEADER_MD)

    with gr.Tab("📊 Dataset Analysis"):
        with gr.Row():
            file_input = gr.File(
                label="Upload your dataset (CSV or Excel)",
                file_types=[".csv", ".xlsx", ".xls"],
            )
        analyze_btn = gr.Button("🔍 Analyze Dataset", variant="primary")
        analysis_output = gr.Markdown(label="Analysis Results")

        analyze_btn.click(analyze_file, inputs=[file_input], outputs=[analysis_output])

    with gr.Tab("💡 Business Insights"):
        with gr.Row():
            question_input = gr.Textbox(
                label="Ask a business question about your data",
                placeholder="e.g., How can I increase revenue? What are the key customer segments?",
                lines=2,
            )
        insights_btn = gr.Button("💡 Generate Insights", variant="primary")
        insights_output = gr.Markdown(label="Business Insights")

        insights_btn.click(
            generate_insights, inputs=[question_input], outputs=[insights_output]
        )

    with gr.Tab("📈 Data Visualization"):
        with gr.Row():
            chart_type = gr.Dropdown(
                choices=["Scatter Plot", "Line Chart", "Bar Chart", "Histogram", "Box Plot"],
                label="Chart Type",
                value="Scatter Plot",
            )
            refresh_cols = gr.Button("🔄 Refresh Columns")

        with gr.Row():
            x_column = gr.Dropdown(choices=[], label="X-axis Column")
            y_column = gr.Dropdown(
                choices=[], label="Y-axis Column (optional for some charts)"
            )

        create_viz_btn = gr.Button("📊 Create Visualization", variant="primary")
        viz_output = gr.Image(label="Visualization")

        refresh_cols.click(get_columns, outputs=[x_column, y_column])
        create_viz_btn.click(
            create_chart,
            inputs=[chart_type, x_column, y_column],
            outputs=[viz_output],
        )

    with gr.Tab("ℹ️ How to Use"):
        gr.Markdown(_HOW_TO_USE_MD)

# Launch the app
if __name__ == "__main__":
    demo.launch()