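# app.py - Business Analyst GPT (Gradio application).
# Hosting-page metadata captured with this file: uploaded by charanKompala,
# commit "Update app.py" (8081e42, verified), file size 25.3 kB.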
import gradio as gr
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import io
import base64
import warnings
warnings.filterwarnings('ignore')
class BusinessAnalystGPT:
    """Rule-based dataset profiler used by the Gradio callbacks.

    The instance keeps the most recently loaded DataFrame in ``self.df`` and
    turns it into Markdown reports (column profile, ML model suggestions,
    chart suggestions, keyword-driven business advice) and PNG charts.
    All text is produced from pandas statistics and fixed templates.
    """

    _UPLOAD_ERROR = "Error: Please upload a CSV or Excel file."

    # Fixed Markdown listing the classification models shown in the report.
    _CLASSIFICATION_MODELS = (
        "\n"
        "1. **Random Forest Classifier** ⭐⭐⭐⭐⭐\n"
        "- Expected Accuracy: 85-95%\n"
        "- Best for: Mixed data types, feature importance\n"
        "- Pros: Handles missing values, no overfitting\n"
        "2. **Logistic Regression** ⭐⭐⭐⭐\n"
        "- Expected Accuracy: 75-85%\n"
        "- Best for: Linear relationships, interpretability\n"
        "- Pros: Fast, interpretable coefficients\n"
        "3. **Decision Tree** ⭐⭐⭐\n"
        "- Expected Accuracy: 70-80%\n"
        "- Best for: Rule-based decisions, interpretability\n"
        "- Pros: Easy to understand and visualize\n"
        "4. **Support Vector Machine (SVM)** ⭐⭐⭐⭐\n"
        "- Expected Accuracy: 80-90%\n"
        "- Best for: High-dimensional data, small datasets\n"
        "- Pros: Effective for complex patterns\n"
        "5. **K-Nearest Neighbors (KNN)** ⭐⭐⭐\n"
        "- Expected Accuracy: 70-85%\n"
        "- Best for: Simple patterns, small datasets\n"
        "- Pros: Simple, no assumptions about data\n"
    )

    # Fixed Markdown listing the regression models shown in the report.
    _REGRESSION_MODELS = (
        "\n"
        "1. **Random Forest Regressor** ⭐⭐⭐⭐⭐\n"
        "- Expected R² Score: 0.80-0.95\n"
        "- Best for: Non-linear relationships, feature importance\n"
        "- Pros: Robust, handles outliers well\n"
        "2. **Linear Regression** ⭐⭐⭐⭐\n"
        "- Expected R² Score: 0.70-0.85\n"
        "- Best for: Linear relationships, interpretability\n"
        "- Pros: Fast, interpretable, baseline model\n"
        "3. **Support Vector Regression (SVR)** ⭐⭐⭐⭐\n"
        "- Expected R² Score: 0.75-0.90\n"
        "- Best for: Non-linear patterns, robust predictions\n"
        "- Pros: Effective for complex relationships\n"
        "4. **Decision Tree Regressor** ⭐⭐⭐\n"
        "- Expected R² Score: 0.65-0.80\n"
        "- Best for: Non-linear, interpretable rules\n"
        "- Pros: Easy to understand decision path\n"
    )

    # (keywords, Markdown) pairs checked in order; the first rule whose
    # keyword occurs as a substring of the lower-cased question wins.
    _TOPIC_RULES = (
        (
            ("revenue", "sales", "profit", "income"),
            "\n### 💰 Revenue/Sales Insights:\n"
            "- Focus on high-performing segments identified in the data\n"
            "- Analyze seasonal trends if time data is available\n"
            "- Consider customer segmentation based on purchase behavior\n",
        ),
        (
            ("customer", "client", "user"),
            "\n### 👥 Customer Insights:\n"
            "- Segment customers based on key characteristics\n"
            "- Identify high-value customer profiles\n"
            "- Analyze customer retention patterns\n",
        ),
        (
            ("marketing", "campaign", "advertising"),
            "\n### 📢 Marketing Insights:\n"
            "- Evaluate campaign performance metrics\n"
            "- Identify most effective channels\n"
            "- Optimize targeting based on demographic data\n",
        ),
        (
            ("predict", "forecast", "future"),
            "\n### 🔮 Predictive Insights:\n"
            "- Use historical patterns for forecasting\n"
            "- Apply machine learning models for predictions\n"
            "- Consider external factors that might influence outcomes\n",
        ),
    )

    # Advice used when no keyword rule matches the question.
    _DEFAULT_TOPIC = (
        "\n### 🎯 General Business Recommendations:\n"
        "- Identify key performance indicators from your data\n"
        "- Look for correlations between important variables\n"
        "- Consider segmentation strategies based on data patterns\n"
    )

    def __init__(self):
        self.df = None  # DataFrame loaded by analyze_dataset(), None until then
        self.analysis_results = ""  # Markdown of the last successful analysis

    @staticmethod
    def _is_text_dtype(dtype):
        """Return True for dtypes that hold free text (object or pandas string)."""
        return dtype == object or isinstance(dtype, pd.StringDtype)

    def _numeric_columns(self):
        """Return the labels of all numeric columns of the loaded DataFrame."""
        return self.df.select_dtypes(include=[np.number]).columns.tolist()

    def _text_columns(self):
        """Return the labels of all text (categorical-like) columns."""
        return [
            col for col, dtype in self.df.dtypes.items()
            if self._is_text_dtype(dtype)
        ]

    def analyze_dataset(self, file):
        """Load an uploaded CSV/Excel file and build the full Markdown report.

        Args:
            file: Object with a ``name`` attribute holding the file path, or
                the path itself as a string. ``None`` means nothing was
                uploaded.

        Returns:
            The Markdown report on success, otherwise a string starting with
            ``"Error"``. On success the DataFrame is kept in ``self.df`` and
            the report in ``self.analysis_results``.
        """
        path = file if isinstance(file, str) else getattr(file, "name", None)
        if not path:
            return self._UPLOAD_ERROR
        try:
            # Compare extensions case-insensitively so "DATA.CSV" is accepted.
            lowered = path.lower()
            if lowered.endswith('.csv'):
                self.df = pd.read_csv(path)
            elif lowered.endswith(('.xlsx', '.xls')):
                self.df = pd.read_excel(path)
            else:
                return self._UPLOAD_ERROR

            n_rows, n_cols = self.df.shape
            memory_kb = self.df.memory_usage(deep=True).sum() / 1024
            parts = [
                "\n# 📊 DATASET ANALYSIS REPORT\n"
                "## 📈 Basic Information\n"
                f"- **Dataset Shape**: {n_rows} rows × {n_cols} columns\n"
                f"- **Memory Usage**: {memory_kb:.2f} KB\n"
                f"- **Missing Values**: {self.df.isnull().sum().sum()} total\n"
                "## 📋 Column Information\n"
            ]

            # Same notion of "numeric" as the recommendation methods, so
            # int32/float32/nullable columns also get range and mean.
            numeric_cols = set(self._numeric_columns())
            for position, col in enumerate(self.df.columns, start=1):
                series = self.df[col]
                missing = series.isnull().sum()
                missing_pct = missing / n_rows * 100 if n_rows else 0.0
                parts.append(f"\n**{position}. {col}**\n")
                parts.append(f" - Data Type: {series.dtype}\n")
                parts.append(f" - Missing Values: {missing} ({missing_pct:.1f}%)\n")
                parts.append(f" - Unique Values: {series.nunique()}\n")
                if col in numeric_cols:
                    parts.append(f" - Range: {series.min():.2f} to {series.max():.2f}\n")
                    parts.append(f" - Mean: {series.mean():.2f}\n")
                elif self._is_text_dtype(series.dtype):
                    top_values = series.value_counts().head(3)
                    parts.append(f" - Top Values: {list(top_values.index)}\n")

            parts.append(self._get_ml_recommendations())
            parts.append(self._get_visualization_recommendations())

            analysis = "".join(parts)
            self.analysis_results = analysis
            return analysis
        except Exception as e:
            # UI boundary: report the failure as text instead of raising.
            return f"Error analyzing dataset: {str(e)}"

    def _has_heavy_outliers(self, numeric_cols):
        """Return True if any column has more than 5% of rows outside 1.5*IQR."""
        threshold = len(self.df) * 0.05
        for col in numeric_cols:
            series = self.df[col]
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            outliers = ((series < (q1 - 1.5 * iqr)) | (series > (q3 + 1.5 * iqr))).sum()
            if outliers > threshold:
                return True
        return False

    def _get_ml_recommendations(self):
        """Build the Markdown section with target, feature and model suggestions.

        Returns:
            The Markdown section, or ``""`` when no dataset is loaded.
        """
        if self.df is None:
            return ""

        numeric_cols = self._numeric_columns()
        categorical_cols = self._text_columns()
        n_rows = len(self.df)

        # (column, task type, reason) for every plausible target column.
        target_suggestions = []
        for col in numeric_cols:
            n_unique = self.df[col].nunique()
            if n_unique == 0:
                # No non-missing values: nothing to predict. This also keeps
                # the ratio below from dividing by zero on an empty frame.
                continue
            unique_ratio = n_unique / n_rows
            if unique_ratio < 0.1 and n_unique <= 10:
                target_suggestions.append(
                    (col, "Classification", f"Has {n_unique} unique values - good for classification")
                )
            elif unique_ratio > 0.1:
                target_suggestions.append(
                    (col, "Regression", "Continuous values - suitable for regression")
                )
        for col in categorical_cols:
            n_unique = self.df[col].nunique()
            if 0 < n_unique <= 10:
                target_suggestions.append(
                    (col, "Classification", f"Categorical with {n_unique} classes")
                )

        parts = ["\n\n## 🤖 MACHINE LEARNING MODEL RECOMMENDATIONS\n\n"]
        parts.append("### 🎯 Potential Target Variables (Dependent Variables):\n")
        if target_suggestions:
            for var, task_type, reason in target_suggestions:
                parts.append(f"- **{var}** ({task_type}): {reason}\n")
        else:
            parts.append("- No clear target variables identified. Please specify based on your business objective.\n")

        parts.append("\n### 📊 Feature Variables (Independent Variables):\n")
        if numeric_cols:
            parts.append(f"- **Numeric Features**: {', '.join(map(str, numeric_cols))}\n")
        if categorical_cols:
            parts.append(f"- **Categorical Features**: {', '.join(map(str, categorical_cols))}\n")

        parts.append("\n### 🔮 Recommended Models & Expected Performance:\n\n")
        task_types = {task_type for _, task_type, _ in target_suggestions}
        if "Classification" in task_types:
            parts.append("#### 🎯 For Classification Tasks:\n")
            parts.append(self._CLASSIFICATION_MODELS)
        if "Regression" in task_types:
            parts.append("\n#### 📈 For Regression Tasks:\n")
            parts.append(self._REGRESSION_MODELS)

        parts.append("\n### 🛠️ Data Preprocessing Recommendations:\n")
        missing_data = self.df.isnull().sum().sum()
        if missing_data > 0:
            parts.append(f"- **Handle Missing Data**: {missing_data} missing values need attention\n")
        if categorical_cols:
            parts.append("- **Encode Categorical Variables**: Use Label Encoding or One-Hot Encoding\n")
        if len(numeric_cols) > 1:
            parts.append("- **Feature Scaling**: Consider StandardScaler for SVM/KNN models\n")
        if self._has_heavy_outliers(numeric_cols):
            parts.append("- **Handle Outliers**: Detected outliers that may affect model performance\n")

        return "".join(parts)

    def _get_visualization_recommendations(self):
        """Build the Markdown section that suggests charts per column and pair.

        Returns:
            The Markdown section, or ``""`` when no dataset is loaded.
        """
        if self.df is None:
            return ""

        numeric_cols = self._numeric_columns()
        categorical_cols = self._text_columns()
        n_columns = len(self.df.columns)

        parts = ["\n\n## 📊 DATA VISUALIZATION RECOMMENDATIONS\n\n"]

        # Single variable visualizations
        parts.append("### 📈 Single Variable Analysis:\n\n")
        for col in numeric_cols:
            parts.append(f"**{col}** (Numeric):\n")
            parts.append(f"- **Histogram**: Show distribution of {col}\n")
            parts.append(f"- **Box Plot**: Identify outliers in {col}\n")
            parts.append(f"- **Density Plot**: Smooth distribution curve for {col}\n\n")
        for col in categorical_cols:
            unique_count = self.df[col].nunique()
            parts.append(f"**{col}** (Categorical - {unique_count} categories):\n")
            if unique_count <= 10:
                parts.append(f"- **Bar Chart**: Count of each category in {col}\n")
                parts.append(f"- **Pie Chart**: Proportion of categories in {col}\n")
            else:
                parts.append(f"- **Bar Chart**: Top 10 categories in {col}\n")
            # NOTE(review): emitted for every categorical column because it
            # carries the blank line that separates entries - confirm intent.
            parts.append(f"- **Donut Chart**: Alternative to pie chart for {col}\n\n")

        # Two variable relationships
        if n_columns > 1:
            parts.append("### 🔗 Two Variable Relationships:\n\n")

            if len(numeric_cols) >= 2:
                parts.append("**Numeric vs Numeric Combinations:**\n")
                for i, col1 in enumerate(numeric_cols):
                    for col2 in numeric_cols[i + 1:]:
                        parts.append(f"- **Scatter Plot**: {col1} (X-axis) vs {col2} (Y-axis)\n")
                        parts.append(f"- **Correlation Heatmap**: Relationship strength between {col1} and {col2}\n")
                parts.append("\n")

            if categorical_cols and numeric_cols:
                parts.append("**Categorical vs Numeric Combinations:**\n")
                for cat_col in categorical_cols:
                    for num_col in numeric_cols:
                        parts.append(f"- **Box Plot**: {cat_col} (X-axis) vs {num_col} (Y-axis)\n")
                        parts.append(f"- **Violin Plot**: Distribution of {num_col} across {cat_col} categories\n")
                        parts.append(f"- **Bar Plot**: Average {num_col} by {cat_col}\n")
                parts.append("\n")

            if len(categorical_cols) >= 2:
                parts.append("**Categorical vs Categorical Combinations:**\n")
                for i, col1 in enumerate(categorical_cols):
                    for col2 in categorical_cols[i + 1:]:
                        parts.append(f"- **Stacked Bar Chart**: {col1} (X-axis) stacked by {col2}\n")
                        parts.append(f"- **Heatmap**: Cross-tabulation of {col1} vs {col2}\n")
                        parts.append(f"- **Grouped Bar Chart**: {col1} grouped by {col2}\n")
                parts.append("\n")

        # Advanced visualizations
        if n_columns >= 3:
            parts.append("### 🎨 Advanced Multi-Variable Analysis:\n\n")
            if len(numeric_cols) >= 3:
                parts.append("**For 3+ Numeric Variables:**\n")
                parts.append(f"- **3D Scatter Plot**: {numeric_cols[0]} (X) vs {numeric_cols[1]} (Y) vs {numeric_cols[2]} (Z)\n")
                parts.append("- **Pair Plot**: All numeric variables against each other\n")
                parts.append("- **Correlation Matrix**: Heatmap of all numeric correlations\n\n")
            if len(numeric_cols) >= 2 and len(categorical_cols) >= 1:
                parts.append("**For Mixed Variable Types:**\n")
                parts.append(f"- **Scatter Plot with Color**: {numeric_cols[0]} vs {numeric_cols[1]} colored by {categorical_cols[0]}\n")
                parts.append(f"- **Bubble Chart**: {numeric_cols[0]} (X) vs {numeric_cols[1]} (Y) with bubble size from another variable\n\n")

        # Dashboard recommendations
        parts.append("### 📋 Dashboard Layout Suggestions:\n\n")
        parts.append("**Top Row**: Overview metrics and key KPIs\n")
        parts.append("**Middle Section**: Main analysis charts (2-3 key visualizations)\n")
        parts.append("**Bottom Section**: Detailed breakdowns and filters\n")
        parts.append("**Side Panel**: Interactive filters and controls\n")

        return "".join(parts)

    def generate_business_insights(self, question):
        """Build a Markdown answer from summary statistics and keyword rules.

        Args:
            question: Free-text business question; ``None`` is treated as "".

        Returns:
            Markdown text, or a prompt to upload data when nothing is loaded.
        """
        if self.df is None:
            return "Please upload a dataset first to generate insights."

        question = question or ""
        parts = [
            "\n# 💡 BUSINESS INSIGHTS & RECOMMENDATIONS\n"
            f"## Question: {question}\n"
            "## 📊 Data-Driven Analysis:\n"
        ]

        numeric_cols = self._numeric_columns()
        categorical_cols = self._text_columns()

        if numeric_cols:
            parts.append("\n### 📈 Key Metrics:\n")
            for col in numeric_cols[:5]:  # first 5 numeric columns only
                series = self.df[col]
                parts.append(
                    f"- **{col}**: Mean = {series.mean():.2f}, "
                    f"Median = {series.median():.2f}, Std = {series.std():.2f}\n"
                )

        if categorical_cols:
            parts.append("\n### 📋 Category Distribution:\n")
            total_count = len(self.df)
            for col in categorical_cols[:3]:  # first 3 text columns only
                modes = self.df[col].mode()
                if modes.empty:
                    # Column holds only missing values: there is no mode.
                    parts.append(f"- **{col}**: No non-missing values\n")
                    continue
                top_category = modes.iloc[0]
                category_count = self.df[col].value_counts().iloc[0]
                percentage = (category_count / total_count) * 100
                parts.append(
                    f"- **{col}**: Most common = '{top_category}' "
                    f"({category_count}/{total_count} = {percentage:.1f}%)\n"
                )

        # Pick the first topic whose keyword appears in the question.
        question_lower = question.lower()
        for keywords, advice in self._TOPIC_RULES:
            if any(word in question_lower for word in keywords):
                parts.append(advice)
                break
        else:
            parts.append(self._DEFAULT_TOPIC)

        # Data quality assessment; an empty frame counts as 0% missing.
        total_cells = self.df.shape[0] * self.df.shape[1]
        if total_cells:
            missing_data_pct = (self.df.isnull().sum().sum() / total_cells) * 100
        else:
            missing_data_pct = 0.0
        parts.append("\n### ⚠️ Data Quality Notes:\n")
        parts.append(f"- Missing data: {missing_data_pct:.1f}% of total data points\n")
        parts.append(f"- Data completeness: {100-missing_data_pct:.1f}%\n")
        if missing_data_pct > 10:
            parts.append("- **Recommendation**: Address missing data before making critical decisions\n")

        return "".join(parts)

    def create_visualization(self, chart_type, x_column, y_column):
        """Render one chart of the loaded DataFrame as a PNG image.

        Args:
            chart_type: "Scatter Plot", "Line Chart", "Bar Chart",
                "Histogram" or "Box Plot".
            x_column: Column for the X axis (the plotted column for
                histogram and box plot).
            y_column: Column for the Y axis; may be empty for histogram,
                box plot and bar charts of a text column.

        Returns:
            PNG data as ``bytes`` on success, otherwise an error message
            string.
        """
        if self.df is None:
            return "Please upload a dataset first."
        if not x_column:
            return "Error creating visualization: Please select an X-axis column."

        fig = None
        try:
            plt.style.use('default')
            fig, ax = plt.subplots(figsize=(10, 6))
            x_is_text = self._is_text_dtype(self.df[x_column].dtype)
            needs_y = chart_type in ("Scatter Plot", "Line Chart") or (
                chart_type == "Bar Chart" and not x_is_text
            )
            if needs_y and not y_column:
                raise ValueError(f"Please select a Y-axis column for a {chart_type}.")

            if chart_type == "Scatter Plot":
                ax.scatter(self.df[x_column], self.df[y_column], alpha=0.6)
                ax.set_xlabel(x_column)
                ax.set_ylabel(y_column)
                ax.set_title(f'Scatter Plot: {x_column} vs {y_column}')
            elif chart_type == "Line Chart":
                ax.plot(self.df[x_column], self.df[y_column])
                ax.set_xlabel(x_column)
                ax.set_ylabel(y_column)
                ax.set_title(f'Line Chart: {x_column} vs {y_column}')
            elif chart_type == "Bar Chart":
                if x_is_text:
                    # Text column: plot the 10 most frequent categories.
                    value_counts = self.df[x_column].value_counts().head(10)
                    ax.bar(value_counts.index, value_counts.values)
                    ax.set_xlabel(x_column)
                    ax.set_ylabel('Count')
                    ax.set_title(f'Bar Chart: {x_column}')
                    ax.tick_params(axis='x', rotation=45)
                else:
                    ax.bar(self.df[x_column], self.df[y_column])
                    ax.set_xlabel(x_column)
                    ax.set_ylabel(y_column)
                    ax.set_title(f'Bar Chart: {x_column} vs {y_column}')
            elif chart_type == "Histogram":
                ax.hist(self.df[x_column].dropna(), bins=30, alpha=0.7)
                ax.set_xlabel(x_column)
                ax.set_ylabel('Frequency')
                ax.set_title(f'Histogram: {x_column}')
            elif chart_type == "Box Plot":
                if y_column and self._is_text_dtype(self.df[y_column].dtype):
                    # Draw on our own axes; without ax= pandas opens a second
                    # figure and the one created above is never closed.
                    self.df.boxplot(column=x_column, by=y_column, ax=ax)
                    ax.set_title(f'Box Plot: {x_column} by {y_column}')
                else:
                    ax.boxplot(self.df[x_column].dropna())
                    ax.set_ylabel(x_column)
                    ax.set_title(f'Box Plot: {x_column}')
            else:
                raise ValueError(f"Unsupported chart type: {chart_type}")

            fig.tight_layout()
            img_buffer = io.BytesIO()
            fig.savefig(img_buffer, format='png', dpi=150, bbox_inches='tight')
            return img_buffer.getvalue()
        except Exception as e:
            # UI boundary: report the failure as text instead of raising.
            return f"Error creating visualization: {str(e)}"
        finally:
            # Close the figure on success and failure alike to avoid leaks.
            if fig is not None:
                plt.close(fig)
# Module-level analyst used by every Gradio callback defined below; it keeps
# the most recently loaded DataFrame between button clicks.
# NOTE(review): as a module global, the loaded dataset is presumably shared by
# all concurrent users of this process - confirm whether per-session state is
# needed.
analyst = BusinessAnalystGPT()
# Define the Gradio interface
def analyze_file(file):
    """Gradio callback: analyze the uploaded file with the shared analyst.

    Returns whatever ``analyst.analyze_dataset`` returns for ``file``.
    """
    report = analyst.analyze_dataset(file)
    return report
def generate_insights(question):
    """Gradio callback: answer a business question with the shared analyst.

    Returns whatever ``analyst.generate_business_insights`` returns for
    ``question``.
    """
    insights = analyst.generate_business_insights(question)
    return insights
def create_chart(chart_type, x_col, y_col):
    """Gradio callback: render the requested chart for the image component.

    ``analyst.create_visualization`` returns PNG bytes on success and an
    error message string otherwise. Neither is a valid value for an image
    output (a string would be treated as a file path), so the PNG is decoded
    into a pixel array and messages are raised as ``gr.Error``.

    Args:
        chart_type: Chart name selected in the UI.
        x_col: Column chosen for the X axis.
        y_col: Column chosen for the Y axis (may be empty).

    Returns:
        The chart as a uint8 pixel array.

    Raises:
        gr.Error: If the analyst returned an error message instead of a PNG.
    """
    result = analyst.create_visualization(chart_type, x_col, y_col)
    if not isinstance(result, bytes):
        raise gr.Error(str(result))
    # imread yields floats in [0, 1] for PNG data; scale to 8-bit pixels.
    pixels = plt.imread(io.BytesIO(result), format="png")
    return (pixels * 255).round().astype(np.uint8)
def get_columns():
    """Gradio callback: build choice updates for the X and Y column dropdowns.

    Returns a pair of ``gr.update`` objects: both list the columns of the
    loaded DataFrame, or both carry an empty list when nothing is loaded.
    """
    frame = analyst.df
    if frame is None:
        return gr.update(choices=[]), gr.update(choices=[])
    x_update = gr.update(choices=list(frame.columns))
    y_update = gr.update(choices=list(frame.columns))
    return x_update, y_update
# Build the Gradio interface: one tab per feature plus a usage guide.
# Markdown literals are kept flush-left so no indentation leaks into the text.
with gr.Blocks(title="Business Analyst GPT", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🤖 Business Analyst GPT
### Your AI-Powered Data Analysis Assistant
Upload your dataset and get comprehensive business insights, ML model recommendations, and visualization suggestions!
""")

    # Tab 1: upload a file and show the full analysis report.
    with gr.Tab("📊 Dataset Analysis"):
        with gr.Row():
            file_input = gr.File(label="Upload your dataset (CSV or Excel)", file_types=[".csv", ".xlsx", ".xls"])
            analyze_btn = gr.Button("🔍 Analyze Dataset", variant="primary")
        analysis_output = gr.Markdown(label="Analysis Results")
        analyze_btn.click(analyze_file, inputs=[file_input], outputs=[analysis_output])

    # Tab 2: free-text question answered from the loaded dataset.
    with gr.Tab("💡 Business Insights"):
        with gr.Row():
            question_input = gr.Textbox(
                label="Ask a business question about your data",
                placeholder="e.g., How can I increase revenue? What are the key customer segments?",
                lines=2
            )
            insights_btn = gr.Button("💡 Generate Insights", variant="primary")
        insights_output = gr.Markdown(label="Business Insights")
        insights_btn.click(generate_insights, inputs=[question_input], outputs=[insights_output])

    # Tab 3: chart builder; the column dropdowns are filled on "Refresh Columns".
    with gr.Tab("📈 Data Visualization"):
        with gr.Row():
            chart_type = gr.Dropdown(
                choices=["Scatter Plot", "Line Chart", "Bar Chart", "Histogram", "Box Plot"],
                label="Chart Type",
                value="Scatter Plot"
            )
            refresh_cols = gr.Button("🔄 Refresh Columns")
        with gr.Row():
            x_column = gr.Dropdown(choices=[], label="X-axis Column")
            y_column = gr.Dropdown(choices=[], label="Y-axis Column (optional for some charts)")
        create_viz_btn = gr.Button("📊 Create Visualization", variant="primary")
        viz_output = gr.Image(label="Visualization")
        refresh_cols.click(get_columns, outputs=[x_column, y_column])
        create_viz_btn.click(create_chart, inputs=[chart_type, x_column, y_column], outputs=[viz_output])

    # Tab 4: static usage guide.
    with gr.Tab("ℹ️ How to Use"):
        gr.Markdown("""
## 🚀 How to Use Business Analyst GPT
### Step 1: Upload Your Dataset
- Click on "Dataset Analysis" tab
- Upload a CSV or Excel file containing your business data
- Click "Analyze Dataset" to get comprehensive insights
### Step 2: Get ML Model Recommendations
After uploading, you'll receive:
- **Target Variable Suggestions**: Which columns can be predicted
- **Feature Variable Identification**: Which columns to use as predictors
- **Model Recommendations**: Best ML algorithms for your data
- **Expected Performance**: Accuracy estimates for each model
### Step 3: Get Specific Visualization Ideas
The analysis will provide:
- **Single Variable Charts**: Best charts for each column
- **Two Variable Relationships**: Specific X-axis and Y-axis recommendations
- **Advanced Visualizations**: Multi-variable analysis suggestions
- **Dashboard Layout**: How to organize your charts
### Step 4: Generate Business Insights
- Ask specific business questions about your data
- Get data-driven recommendations and insights
- Receive actionable business strategies
### Step 5: Create Visualizations
- Choose from various chart types
- Select specific columns for X and Y axes
- Generate publication-ready charts
## 📋 Supported File Types
- CSV files (.csv)
- Excel files (.xlsx, .xls)
## 🎯 Best Practices
1. **Clean Data**: Ensure your dataset has clear column headers
2. **Relevant Questions**: Ask specific business questions for better insights
3. **Column Selection**: Choose appropriate columns for visualizations
4. **Data Size**: Larger datasets provide more reliable ML recommendations
""")

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()