charanKompala committed
Commit 8081e42 · verified · 1 Parent(s): 1cee08b

Update app.py

Files changed (1):
  app.py +528 -746

app.py CHANGED
@@ -1,768 +1,550 @@
  import gradio as gr
- import json
- from datetime import datetime
- import os
-
- # Business Analyst templates and responses
- BA_TEMPLATES = {
-     "requirements": """
- # Business Requirements Document
-
- ## Project Overview
- **Project:** {project_name}
- **Date:** {date}
- **Business Analyst:** {analyst_name}
-
- ## Executive Summary
- {executive_summary}
-
- ## Business Objectives
- {objectives}
-
- ## Functional Requirements
- {functional_requirements}
-
- ## Non-Functional Requirements
- {non_functional_requirements}
-
- ## Acceptance Criteria
- {acceptance_criteria}
-
- ## Assumptions and Dependencies
- {assumptions}
-
- ## Risks and Mitigation
- {risks}
- """,
-
-     "user_story": """
- # User Story
-
- **As a** {user_type}
- **I want** {want}
- **So that** {benefit}
-
- ## Acceptance Criteria
- {acceptance_criteria}
-
- ## Definition of Done
- {definition_of_done}
-
- **Priority:** {priority}
- **Story Points:** {story_points}
- """,
-
-     "process_flow": """
- # Business Process Flow
-
- ## Process Name: {process_name}
-
- ## Process Overview
- {overview}
-
- ## Process Steps:
- {steps}
-
- ## Stakeholders Involved:
- {stakeholders}
-
- ## Systems/Tools Used:
- {systems}
-
- ## Key Performance Indicators (KPIs):
- {kpis}
-
- ## Process Improvements:
- {improvements}
- """,
-
-     "gap_analysis": """
- # Gap Analysis Report
-
- ## Current State Analysis
- {current_state}
-
- ## Future State Vision
- {future_state}
-
- ## Identified Gaps
- {gaps}
-
- ## Impact Assessment
- {impact}
-
- ## Recommendations
- {recommendations}
-
- ## Implementation Roadmap
- {roadmap}
- """,
-
-     "stakeholder_analysis": """
- # Stakeholder Analysis
-
- ## Project: {project_name}
-
- ## Stakeholder Matrix
-
- ### High Influence, High Interest (Manage Closely)
- {high_high}
-
- ### High Influence, Low Interest (Keep Satisfied)
- {high_low}
-
- ### Low Influence, High Interest (Keep Informed)
- {low_high}
-
- ### Low Influence, Low Interest (Monitor)
- {low_low}
-
- ## Communication Plan
- {communication_plan}
-
- ## Engagement Strategy
- {engagement_strategy}
- """,
-
-     "data_analysis": """
- # Data Analysis Report
-
- ## Dataset Overview
- **Dataset Name:** {dataset_name}
- **Shape:** {shape}
- **Upload Date:** {date}
-
- ## Column Analysis
- {column_analysis}
-
- ## Machine Learning Model Recommendations
-
- ### 🤖 Suitable ML Models:
- {ml_recommendations}
-
- ### 📊 Variable Identification:
- **Potential Dependent Variables (Target):**
- {dependent_vars}
-
- **Potential Independent Variables (Features):**
- {independent_vars}
-
- ### 📈 Data Visualization Recommendations:
- {viz_recommendations}

- ## Data Quality Assessment
- {data_quality}

- ## Next Steps
- {next_steps}
  """
- }
-
- def analyze_dataset(file):
-     """Analyze uploaded dataset and provide ML recommendations"""
-     if file is None:
-         return "Please upload a dataset file (CSV format)"
-
-     try:
-         # Read the CSV file content
-         with open(file, 'r', encoding='utf-8') as f:
-             content = f.read()
-
-         # Parse CSV manually (simple approach)
-         lines = content.strip().split('\n')
-         if len(lines) < 2:
-             return "Dataset appears to be empty or invalid"
-
-         # Get headers
-         headers = [col.strip().strip('"') for col in lines[0].split(',')]
-         data_rows = lines[1:]
-
-         # Basic dataset info
-         dataset_name = os.path.basename(file)
-         shape = f"{len(data_rows)} rows × {len(headers)} columns"
-
-         # Analyze columns
-         column_analysis = analyze_columns_simple(headers, data_rows)
-
-         # ML model recommendations
-         ml_recommendations = recommend_ml_models_simple(headers, data_rows)
-
-         # Variable identification
-         dependent_vars, independent_vars = identify_variables_simple(headers, data_rows)
-
-         # Visualization recommendations
-         viz_recommendations = recommend_visualizations_simple(headers, data_rows)
-
-         # Data quality assessment
-         data_quality = assess_data_quality_simple(headers, data_rows)
-
-         # Next steps
-         next_steps = generate_next_steps_simple()
-
-         # Generate report
-         template = BA_TEMPLATES["data_analysis"]
-         return template.format(
-             dataset_name=dataset_name,
-             shape=shape,
-             date=datetime.now().strftime("%Y-%m-%d"),
-             column_analysis=column_analysis,
-             ml_recommendations=ml_recommendations,
-             dependent_vars=dependent_vars,
-             independent_vars=independent_vars,
-             viz_recommendations=viz_recommendations,
-             data_quality=data_quality,
-             next_steps=next_steps
-         )
-
-     except Exception as e:
-         return f"Error analyzing dataset: {str(e)}. Please ensure your file is a valid CSV format."
-
- def analyze_columns_simple(headers, data_rows):
-     """Simple column analysis without pandas"""
-     analysis = []
-
-     for i, col in enumerate(headers):
-         # Get sample values for this column
-         values = []
-         for row in data_rows[:100]:  # Sample first 100 rows
-             row_data = [cell.strip().strip('"') for cell in row.split(',')]
-             if i < len(row_data):
-                 values.append(row_data[i])
-
-         # Basic analysis
-         non_empty_values = [v for v in values if v and v != '']
-         unique_values = len(set(non_empty_values))
-         missing_count = len(values) - len(non_empty_values)
-
-         # Determine data type
-         is_numeric = all(is_number(v) for v in non_empty_values[:10] if v)
-
-         if is_numeric:
-             col_type = "Numerical"
-             numeric_vals = [float(v) for v in non_empty_values if is_number(v)]
-             if numeric_vals:
-                 stats = f"Range: {min(numeric_vals)} to {max(numeric_vals)}, Avg: {sum(numeric_vals)/len(numeric_vals):.2f}"
-             else:
-                 stats = "No valid numeric data"
          else:
-             col_type = "Categorical/Text"
-             common_values = list(set(non_empty_values[:10]))[:3]
-             stats = f"Sample values: {common_values}"
-
-         missing_pct = round((missing_count / len(values)) * 100, 1) if values else 0
-
-         analysis.append(f"**{col}** ({col_type}): {unique_values} unique values, {missing_pct}% missing\n  └─ {stats}")
-
-     return "\n\n".join(analysis)
-
- def is_number(s):
-     """Check if string represents a number"""
-     try:
-         float(s)
-         return True
-     except (ValueError, TypeError):
-         return False
-
- def recommend_ml_models_simple(headers, data_rows):
-     """Simple ML model recommendations"""
-     recommendations = []
-
-     # Count estimated numerical and categorical columns
-     numerical_cols = []
-     categorical_cols = []
-
-     for i, col in enumerate(headers):
-         # Sample some values to determine type
-         sample_values = []
-         for row in data_rows[:50]:
-             row_data = [cell.strip().strip('"') for cell in row.split(',')]
-             if i < len(row_data) and row_data[i]:
-                 sample_values.append(row_data[i])
-
-         if sample_values:
-             numeric_count = sum(1 for v in sample_values if is_number(v))
-             if numeric_count > len(sample_values) * 0.7:  # 70% numeric
-                 numerical_cols.append(col)
          else:
-                 categorical_cols.append(col)
-
-     total_rows = len(data_rows)
-
-     # Classification models recommendation
-     if categorical_cols:
-         recommendations.append("""
- **🎯 Classification Models** (Suitable for predicting categories)
- • **Logistic Regression** - Good for binary classification, interpretable results
- • **Random Forest** - Handles mixed data types well, provides feature importance
- • **Decision Tree** - Easy to interpret, good for creating business rules
- • **Support Vector Machine (SVM)** - Effective for high-dimensional data
- • **Gradient Boosting (XGBoost)** - High accuracy, handles missing values well
- • **Neural Networks** - For complex patterns (recommended if dataset > 1000 rows)
-
- *Best for: Predicting categories like Yes/No, High/Medium/Low, Customer segments*""")
-
-     # Regression models recommendation
-     if numerical_cols:
-         recommendations.append("""
- **📈 Regression Models** (Suitable for predicting continuous values)
- • **Linear Regression** - Simple, interpretable, good as baseline model
- • **Polynomial Regression** - For capturing non-linear relationships
- • **Random Forest Regressor** - Robust to outliers, handles mixed data types
- • **Ridge/Lasso Regression** - Good for high-dimensional data, prevents overfitting
- • **Gradient Boosting Regressor** - High accuracy for complex patterns
- • **Neural Networks** - For complex non-linear relationships
-
- *Best for: Predicting prices, quantities, scores, measurements, forecasts*""")
-
-     # Clustering models
-     if total_rows > 50:
-         recommendations.append("""
- **🔍 Clustering Models** (Suitable for finding hidden patterns)
- • **K-Means Clustering** - Good for customer segmentation, market analysis
- • **Hierarchical Clustering** - Creates tree-like cluster structure
- • **DBSCAN** - Finds clusters of varying shapes and sizes, handles noise
-
- *Best for: Customer segmentation, market analysis, pattern discovery*""")
-
-     # Time series recommendation
-     date_keywords = ['date', 'time', 'year', 'month', 'day', 'timestamp']
-     has_date_column = any(keyword in col.lower() for col in headers for keyword in date_keywords)
-
-     if has_date_column:
-         recommendations.append("""
- **⏰ Time Series Models** (Suitable for temporal data)
- • **ARIMA** - Classical time series forecasting
- • **Prophet** - Good for seasonal patterns and holidays
- • **LSTM Neural Networks** - For complex temporal patterns
- • **Exponential Smoothing** - Simple but effective for trends
-
- *Best for: Sales forecasting, demand prediction, trend analysis*""")
-
-     # Dataset size considerations
-     if total_rows < 100:
-         recommendations.append("""
- **⚠️ Dataset Size Consideration:**
- Your dataset is small (< 100 rows). Consider:
- • Simple models like Linear/Logistic Regression
- • Decision Trees with limited depth to avoid overfitting
- • Collecting more data for better model performance
- • Cross-validation for reliable performance estimates""")
-     elif total_rows > 10000:
-         recommendations.append("""
- **🚀 Large Dataset Advantages:**
- Your dataset is large (> 10,000 rows). You can use:
- • Complex models like Neural Networks and Deep Learning
- • Ensemble methods for higher accuracy
- • Advanced feature engineering techniques
- • Multiple model comparison and stacking""")
-
-     return "\n".join(recommendations) if recommendations else "Unable to determine suitable models. Please check your dataset format."
-
- def identify_variables_simple(headers, data_rows):
-     """Simple variable identification"""
-     dependent_candidates = []
-     independent_candidates = []
-
-     # Look for potential target variables
-     target_keywords = ['target', 'label', 'class', 'outcome', 'result', 'prediction', 'y']
-
-     for col in headers:
-         col_lower = col.lower()
-
-         # Check if column name suggests it's a target
-         if any(keyword in col_lower for keyword in target_keywords):
-             dependent_candidates.append(f"• **{col}** - Column name suggests this is a target variable")
-             continue
-
-         # Sample values to determine if categorical with few categories
-         sample_values = []
-         for row in data_rows[:100]:
-             row_data = [cell.strip().strip('"') for cell in row.split(',')]
-             col_idx = headers.index(col)
-             if col_idx < len(row_data) and row_data[col_idx]:
-                 sample_values.append(row_data[col_idx])
-
-         unique_values = len(set(sample_values))
-
-         # Potential categorical target (few unique values)
-         if unique_values <= 10 and len(sample_values) > 0:
-             sample_unique = list(set(sample_values))[:5]
-             dependent_candidates.append(f"• **{col}** - Categorical with {unique_values} categories: {sample_unique}")
-         # Potential numerical target
-         elif is_number(sample_values[0]) if sample_values else False:
-             if any(keyword in col_lower for keyword in ['price', 'amount', 'score', 'rating', 'value']):
-                 dependent_candidates.append(f"• **{col}** - Numerical variable suitable for regression")
-
-     # All other columns as independent variables
-     dep_var_names = [line.split('**')[1].split('**')[0] for line in dependent_candidates]
-
-     for col in headers:
-         if col not in dep_var_names:
-             # Determine type
-             sample_values = []
-             for row in data_rows[:50]:
-                 row_data = [cell.strip().strip('"') for cell in row.split(',')]
-                 col_idx = headers.index(col)
-                 if col_idx < len(row_data) and row_data[col_idx]:
-                     sample_values.append(row_data[col_idx])
-
-             if sample_values:
-                 is_numeric = all(is_number(v) for v in sample_values[:10])
-                 col_type = "Numerical" if is_numeric else "Categorical"
-                 unique_count = len(set(sample_values))
-                 independent_candidates.append(f"• **{col}** ({col_type}) - {unique_count} unique values")
-
-     # Format output
-     dep_vars = "\n".join(dependent_candidates) if dependent_candidates else "• No clear target variables identified automatically.\n• Consider which variable you want to predict based on your business objective."
-
-     indep_vars = "\n".join(independent_candidates[:15]) if independent_candidates else "• All columns can potentially serve as features."
-     if len(independent_candidates) > 15:
-         indep_vars += f"\n• ... and {len(independent_candidates) - 15} more variables"
-
-     return dep_vars, indep_vars
-
- def recommend_visualizations_simple(headers, data_rows):
-     """Simple visualization recommendations"""
-     viz_recommendations = []
-
-     # Analyze column types
-     numerical_cols = []
-     categorical_cols = []
-
-     for col in headers:
-         # Sample values to determine type
-         sample_values = []
-         for row in data_rows[:50]:
-             row_data = [cell.strip().strip('"') for cell in row.split(',')]
-             col_idx = headers.index(col)
-             if col_idx < len(row_data) and row_data[col_idx]:
-                 sample_values.append(row_data[col_idx])
-
-         if sample_values:
-             numeric_count = sum(1 for v in sample_values if is_number(v))
-             if numeric_count > len(sample_values) * 0.7:
-                 numerical_cols.append(col)
-             else:
-                 categorical_cols.append(col)
-
-     # Recommendations for numerical variables
-     if numerical_cols:
-         viz_recommendations.append("**📊 For Numerical Variables:**")
-         for col in numerical_cols[:5]:  # Limit to first 5
-             viz_recommendations.append(f"""
- • **{col}**:
-   - **Histogram** - Show distribution pattern and identify outliers
-   - **Box Plot** - Visualize quartiles, median, and outliers
-   - **Line Chart** - Show trends over time (if sequential data)
-   - **Scatter Plot** vs other numerical variables - Find correlations""")
-
-     # Recommendations for categorical variables
-     if categorical_cols:
-         viz_recommendations.append("\n**📈 For Categorical Variables:**")
-         for col in categorical_cols[:5]:  # Limit to first 5
-             # Count unique values
-             sample_values = []
-             for row in data_rows[:100]:
-                 row_data = [cell.strip().strip('"') for cell in row.split(',')]
-                 col_idx = headers.index(col)
-                 if col_idx < len(row_data) and row_data[col_idx]:
-                     sample_values.append(row_data[col_idx])
-
-             unique_count = len(set(sample_values))
-             viz_recommendations.append(f"""
- **{col}** ({unique_count} categories):
-   - **Bar Chart** - Compare frequency/count of each category
-   - **Pie Chart** - Show proportional breakdown (best if < 8 categories)
-   - **Donut Chart** - Modern alternative to pie chart
-   - **Horizontal Bar Chart** - Better for long category names""")
-
-     # Relationship visualizations
-     if len(numerical_cols) >= 2:
-         viz_recommendations.append(f"""
- **🔗 For Relationships Between Variables:**
- • **Correlation Heatmap** - Show relationships between all numerical variables
- • **Scatter Plot Matrix** - Compare pairs of numerical variables
- • **Pair Plot** - Detailed pairwise relationships with distributions""")
-
-     if numerical_cols and categorical_cols:
-         viz_recommendations.append(f"""
- • **Box Plot by Category** - Compare {numerical_cols[0]} distribution across {categorical_cols[0]} categories
- • **Violin Plot** - Show distribution shape across categories
- • **Bar Chart with Error Bars** - Show mean values with confidence intervals""")
-
-     # Advanced visualizations
-     if len(headers) >= 3:
-         viz_recommendations.append(f"""
- **🎯 Advanced Visualizations:**
- • **3D Scatter Plot** - Explore relationships between 3 variables
- • **Bubble Chart** - Show 3 dimensions using x, y, and bubble size
- • **Treemap** - For hierarchical categorical data
- • **Sunburst Chart** - For nested categorical relationships
- • **Parallel Coordinates** - For multivariate data exploration""")
-
-     # Dashboard recommendations
-     viz_recommendations.append(f"""
- **📋 Dashboard & Interactive Elements:**
- • **KPI Cards** - Show key metrics and summary statistics
- • **Filter Panels** - Allow users to slice and dice data
- • **Trend Lines** - Add to charts to highlight patterns
- • **Data Tables** - Show raw data with sorting and filtering
- • **Dropdown Selectors** - For choosing variables to visualize""")
-
-     return "\n".join(viz_recommendations)
-
- def assess_data_quality_simple(headers, data_rows):
-     """Simple data quality assessment"""
-     quality_issues = []
-
-     # Check for missing values
-     missing_analysis = []
-     for col in headers:
-         col_idx = headers.index(col)
-         missing_count = 0
-         total_count = 0
-
-         for row in data_rows:
-             row_data = [cell.strip().strip('"') for cell in row.split(',')]
-             total_count += 1
-             if col_idx >= len(row_data) or not row_data[col_idx] or row_data[col_idx] == '':
-                 missing_count += 1
-
-         if missing_count > 0:
-             missing_pct = round((missing_count / total_count) * 100, 1)
-             missing_analysis.append(f"  - {col}: {missing_pct}% missing ({missing_count}/{total_count})")
-
-     if missing_analysis:
-         quality_issues.append("**Missing Values Detected:**")
-         quality_issues.extend(missing_analysis[:5])  # Show first 5
-         if len(missing_analysis) > 5:
-             quality_issues.append(f"  - ... and {len(missing_analysis) - 5} more columns with missing data")
-
-     # Check for potential duplicates (simple check)
-     unique_rows = set()
-     duplicate_count = 0
-     for row in data_rows[:1000]:  # Check first 1000 rows
-         if row in unique_rows:
-             duplicate_count += 1
-         else:
-             unique_rows.add(row)
-
-     if duplicate_count > 0:
-         quality_issues.append(f"**Potential Duplicate Rows:** {duplicate_count} duplicate rows detected in sample")
-
-     # Check for inconsistent data formats
-     format_issues = []
-     for col in headers[:5]:  # Check first 5 columns
-         col_idx = headers.index(col)
-         values = []
-         for row in data_rows[:100]:
-             row_data = [cell.strip().strip('"') for cell in row.split(',')]
-             if col_idx < len(row_data) and row_data[col_idx]:
-                 values.append(row_data[col_idx])
-
-         if values:
-             # Check for mixed numeric/text in same column
-             numeric_count = sum(1 for v in values if is_number(v))
-             if 0 < numeric_count < len(values):
-                 format_issues.append(f"  - {col}: Mixed data types (numeric and text)")
-
-     if format_issues:
-         quality_issues.append("**Data Format Issues:**")
-         quality_issues.extend(format_issues)
-
-     # Overall assessment
-     if not quality_issues:
-         return "✅ **Good Data Quality:** No major data quality issues detected in the sample."
-     else:
-         quality_issues.insert(0, "⚠️ **Data Quality Issues Found:**")
-         quality_issues.append("\n**Recommendations:**")
-         quality_issues.append("• Clean missing values before model training")
-         quality_issues.append("• Remove or handle duplicate records")
-         quality_issues.append("• Standardize data formats within columns")
-         quality_issues.append("• Validate data types and convert as needed")
-
-     return "\n".join(quality_issues)
-
- def generate_next_steps_simple():
-     """Generate recommended next steps"""
-     steps = [
-         "1. **Data Cleaning:** Handle missing values, duplicates, and outliers",
-         "2. **Data Exploration:** Create visualizations to understand patterns and relationships",
-         "3. **Feature Engineering:** Create new variables from existing ones if needed",
-         "4. **Variable Selection:** Choose the most relevant features for your model",
-         "5. **Model Selection:** Pick appropriate ML model based on recommendations above",
-         "6. **Data Splitting:** Divide data into training and testing sets (80/20 split)",
-         "7. **Model Training:** Train your selected model with the training data",
-         "8. **Model Evaluation:** Test model performance using appropriate metrics",
-         "9. **Model Tuning:** Optimize hyperparameters for better performance",
-         "10. **Model Deployment:** Implement the model for business use"
-     ]
-
-     return "\n".join(steps)
-
- def generate_ba_document(doc_type, user_input, dataset_file=None):
-     """Generate business analyst document based on type and input"""
-
-     # Handle dataset analysis
-     if doc_type == "data_analysis":
-         if dataset_file:
-             return analyze_dataset(dataset_file)
          else:
-             return "Please upload a dataset file (CSV format) to perform data analysis."
-
-     # Parse user input to extract key information
-     lines = user_input.strip().split('\n')
-     context = {}
-
-     # Extract context from user input
-     for line in lines:
-         if ':' in line:
-             key, value = line.split(':', 1)
-             context[key.strip().lower().replace(' ', '_')] = value.strip()
-
-     # Add default values
-     context['date'] = datetime.now().strftime("%Y-%m-%d")
-     context['analyst_name'] = context.get('analyst_name', 'Business Analyst')
-
-     if doc_type == "requirements":
-         return generate_requirements_doc(context, user_input)
-     elif doc_type == "user_story":
-         return generate_user_story(context, user_input)
-     elif doc_type == "process_flow":
-         return generate_process_flow(context, user_input)
-     elif doc_type == "gap_analysis":
-         return generate_gap_analysis(context, user_input)
-     elif doc_type == "stakeholder_analysis":
-         return generate_stakeholder_analysis(context, user_input)
-     else:
-         return "Please select a valid document type."
-
- def generate_requirements_doc(context, user_input):
-     """Generate Business Requirements Document"""
-
-     # Default values if not provided
-     project_name = context.get('project_name', context.get('project', 'New Project'))
-
-     # Generate content based on input
-     if 'objective' in user_input.lower() or 'goal' in user_input.lower():
-         objectives = extract_objectives(user_input)
-     else:
-         objectives = "• Improve business efficiency\n• Enhance user experience\n• Reduce operational costs"
-
-     if 'requirement' in user_input.lower():
-         functional_requirements = extract_requirements(user_input)
-     else:
-         functional_requirements = "• System shall allow users to [specific functionality]\n• System shall provide [specific feature]\n• System shall integrate with [existing systems]"
-
-     template = BA_TEMPLATES["requirements"]
-     return template.format(
-         project_name=project_name,
-         date=context['date'],
-         analyst_name=context['analyst_name'],
-         executive_summary=context.get('summary', 'This document outlines the business requirements for ' + project_name),
-         objectives=objectives,
-         functional_requirements=functional_requirements,
-         non_functional_requirements="• Performance: Response time < 2 seconds\n• Security: Role-based access control\n• Scalability: Support 1000+ concurrent users",
-         acceptance_criteria="• All functional requirements implemented\n• User acceptance testing completed\n• Performance benchmarks met",
-         assumptions="• Users have basic computer literacy\n• Integration APIs are available\n• Project timeline is 6 months",
-         risks="• Technical complexity may cause delays\n• User adoption challenges\n• Budget constraints"
-     )
-
- def generate_user_story(context, user_input):
-     """Generate User Story"""
-
-     # Extract user story components
-     user_type = context.get('user_type', context.get('as_a', 'end user'))
-     want = context.get('want', context.get('i_want', 'perform a specific action'))
-     benefit = context.get('benefit', context.get('so_that', 'achieve my goal efficiently'))
-
-     template = BA_TEMPLATES["user_story"]
-     return template.format(
-         user_type=user_type,
-         want=want,
-         benefit=benefit,
-         acceptance_criteria="• Given [precondition]\n• When [action]\n• Then [expected result]",
-         definition_of_done="• Code reviewed and approved\n• Unit tests written and passing\n• Documentation updated\n• Deployed to staging environment",
-         priority=context.get('priority', 'Medium'),
-         story_points=context.get('story_points', 'TBD')
-     )
-
- def generate_process_flow(context, user_input):
-     """Generate Process Flow Document"""
-
-     process_name = context.get('process_name', context.get('process', 'Business Process'))
-
-     template = BA_TEMPLATES["process_flow"]
-     return template.format(
-         process_name=process_name,
-         overview=context.get('overview', f'This document describes the {process_name} workflow and procedures.'),
-         steps="1. Process initiation\n2. Data collection\n3. Analysis and review\n4. Decision making\n5. Implementation\n6. Monitoring and feedback",
-         stakeholders="• Business Users\n• Process Owners\n• IT Support\n• Management",
-         systems="• CRM System\n• ERP System\n• Document Management\n• Reporting Tools",
-         kpis="• Process completion time\n• Error rate\n• Customer satisfaction\n• Cost per transaction",
-         improvements="• Automation opportunities\n• Bottleneck elimination\n• Quality enhancements\n• Cost reduction initiatives"
-     )
-
- def generate_gap_analysis(context, user_input):
-     """Generate Gap Analysis Report"""
-
-     template = BA_TEMPLATES["gap_analysis"]
-     return template.format(
-         current_state="Current business processes and systems analysis goes here...",
-         future_state="Desired future state and target operating model goes here...",
-         gaps="• Process inefficiencies\n• Technology limitations\n• Skill gaps\n• Resource constraints",
-         impact="• Operational impact assessment\n• Financial implications\n• Risk evaluation\n• Timeline considerations",
-         recommendations="• Short-term quick wins\n• Medium-term improvements\n• Long-term strategic initiatives\n• Resource allocation suggestions",
-         roadmap="Phase 1: Foundation (Months 1-3)\nPhase 2: Implementation (Months 4-8)\nPhase 3: Optimization (Months 9-12)"
-     )
-
- def generate_stakeholder_analysis(context, user_input):
-     """Generate Stakeholder Analysis"""
-
-     project_name = context.get('project_name', context.get('project', 'Current Project'))
-
-     template = BA_TEMPLATES["stakeholder_analysis"]
-     return template.format(
-         project_name=project_name,
-         high_high="• Executive Sponsor\n• Project Manager\n• Key Business Users",
-         high_low="• Senior Management\n• Department Heads\n• Regulatory Bodies",
-         low_high="• End Users\n• Customer Representatives\n• Support Teams",
-         low_low="• Vendors\n• External Consultants\n• Peripheral Teams",
-         communication_plan="• Weekly status reports\n• Monthly steering committee meetings\n• Quarterly business reviews",
-         engagement_strategy="• Regular one-on-one meetings\n• Focus groups and workshops\n• Change management activities"
-     )
-
- def extract_objectives(text):
-     """Extract objectives from user input"""
-     objectives = []
-     lines = text.split('\n')
-     for line in lines:
-         if any(keyword in line.lower() for keyword in ['objective', 'goal', 'aim', 'target']):
-             objectives.append(f"• {line.strip()}")
-
-     if not objectives:
-         return "• Improve business efficiency\n• Enhance user experience\n• Reduce operational costs"
-
-     return '\n'.join(objectives)
-
- def extract_requirements(text):
-     """Extract requirements from user input"""
-     requirements = []
-     lines = text.split('\n')
-     for line in lines:
-         if any(keyword in line.lower() for keyword in ['requirement', 'must', 'shall', 'should', 'need']):
-             requirements.append(f"• {line.strip()}")
-
-     if not requirements:
-         return "• System shall provide core functionality\n• System shall integrate with existing tools\n• System shall meet performance standards"
-
-     return '\n'.join(requirements)
-
- def ba_chat_response(message, history):
-     """Generate BA consultant response"""
-
-     ba_responses = {
-         "hello": "Hello! I'm your Business Analyst assistant. I can help you create requirements documents, user stories, process flows, gap analyses, stakeholder analyses, and analyze datasets for ML recommendations. What would you like to work on today?",
-         "help": "I can assist you with:\n• Business Requirements Documents\n• User Stories\n• Process Flow Documentation\n• Gap Analysis Reports\n• Stakeholder Analysis\n• Data Analysis with ML recommendations\n\nJust tell me what type of document you need and provide some details!",
-         "requirements": "To create a Business Requirements Document, please provide:\n• Project name\n• Business objectives\n• Key requirements\n• Stakeholders involved\n\nThen select 'Requirements Document' from the dropdown above.",
-         "user story": "For user stories, please provide:\n• User type (As a...)\n• What they want (I want...)\n• The benefit (So that...)\n• Priority level\n\nThen select 'User Story
  import gradio as gr
+ import pandas as pd
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+ from sklearn.linear_model import LinearRegression, LogisticRegression
+ from sklearn.svm import SVC, SVR
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+ from sklearn.naive_bayes import GaussianNB
+ from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
+ from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import io
+ import base64
+ from PIL import Image  # needed so create_visualization can hand a renderable image to gr.Image
+ import warnings
+ warnings.filterwarnings('ignore')
+
+ class BusinessAnalystGPT:
+     def __init__(self):
+         self.df = None
+         self.analysis_results = ""
+
+     def analyze_dataset(self, file):
+         """Analyze uploaded dataset and provide comprehensive insights"""
+         try:
+             # Read the dataset
+             if file.name.endswith('.csv'):
+                 self.df = pd.read_csv(file.name)
+             elif file.name.endswith(('.xlsx', '.xls')):
+                 self.df = pd.read_excel(file.name)
+             else:
+                 return "Error: Please upload a CSV or Excel file."
+
+             # Basic dataset info
+             analysis = f"""
+ # 📊 DATASET ANALYSIS REPORT
+
+ ## 📈 Basic Information
+ - **Dataset Shape**: {self.df.shape[0]} rows × {self.df.shape[1]} columns
+ - **Memory Usage**: {self.df.memory_usage(deep=True).sum() / 1024:.2f} KB
+ - **Missing Values**: {self.df.isnull().sum().sum()} total
+
+ ## 📋 Column Information
  """
+
+             # Column details
+             for i, col in enumerate(self.df.columns):
+                 dtype = str(self.df[col].dtype)
+                 missing = self.df[col].isnull().sum()
+                 unique_vals = self.df[col].nunique()
+
+                 analysis += f"\n**{i+1}. {col}**\n"
+                 analysis += f"   - Data Type: {dtype}\n"
+                 analysis += f"   - Missing Values: {missing} ({missing/len(self.df)*100:.1f}%)\n"
+                 analysis += f"   - Unique Values: {unique_vals}\n"
+
+                 if dtype in ['int64', 'float64']:
+                     analysis += f"   - Range: {self.df[col].min():.2f} to {self.df[col].max():.2f}\n"
+                     analysis += f"   - Mean: {self.df[col].mean():.2f}\n"
+                 elif dtype == 'object':
+                     top_values = self.df[col].value_counts().head(3)
+                     analysis += f"   - Top Values: {list(top_values.index)}\n"
+
+             # Add ML Model Recommendations
+             analysis += self._get_ml_recommendations()
+
+             # Add Visualization Recommendations
+             analysis += self._get_visualization_recommendations()
+
+             self.analysis_results = analysis
+             return analysis
+
+         except Exception as e:
+             return f"Error analyzing dataset: {str(e)}"
+     def _get_ml_recommendations(self):
+         """Analyze dataset and recommend suitable ML models with variable suggestions"""
+         if self.df is None:
+             return ""
+
+         ml_analysis = "\n\n## 🤖 MACHINE LEARNING MODEL RECOMMENDATIONS\n\n"
+
+         # Identify variable types
+         numeric_cols = self.df.select_dtypes(include=[np.number]).columns.tolist()
+         categorical_cols = self.df.select_dtypes(include=['object']).columns.tolist()
+
+         ml_analysis += "### 🎯 Potential Target Variables (Dependent Variables):\n"
+
+         # Suggest target variables based on data characteristics
+         target_suggestions = []
+
+         for col in numeric_cols:
+             unique_ratio = self.df[col].nunique() / len(self.df)
+             if unique_ratio < 0.1 and self.df[col].nunique() <= 10:
+                 target_suggestions.append((col, "Classification", f"Has {self.df[col].nunique()} unique values - good for classification"))
+             elif unique_ratio > 0.1:
+                 target_suggestions.append((col, "Regression", "Continuous values - suitable for regression"))
+
+         for col in categorical_cols:
+             if self.df[col].nunique() <= 10:
+                 target_suggestions.append((col, "Classification", f"Categorical with {self.df[col].nunique()} classes"))
+
+         if target_suggestions:
+             for var, task_type, reason in target_suggestions:
+                 ml_analysis += f"- **{var}** ({task_type}): {reason}\n"
          else:
+             ml_analysis += "- No clear target variables identified. Please specify based on your business objective.\n"
+
+         ml_analysis += "\n### 📊 Feature Variables (Independent Variables):\n"
+
+         # List potential feature variables
+         all_cols = list(self.df.columns)
+         if len(numeric_cols) > 0:
+             ml_analysis += f"- **Numeric Features**: {', '.join(numeric_cols)}\n"
+         if len(categorical_cols) > 0:
+             ml_analysis += f"- **Categorical Features**: {', '.join(categorical_cols)}\n"
+
+         # Model recommendations based on data characteristics
+         ml_analysis += "\n### 🔮 Recommended Models & Expected Performance:\n\n"
+
+         # Classification models
+         if any("Classification" in suggestion[1] for suggestion in target_suggestions):
+             ml_analysis += "#### 🎯 For Classification Tasks:\n"
+             ml_analysis += """
+ 1. **Random Forest Classifier** ⭐⭐⭐⭐⭐
+    - Expected Accuracy: 85-95%
+    - Best for: Mixed data types, feature importance
+    - Pros: Handles missing values, no overfitting
+
+ 2. **Logistic Regression** ⭐⭐⭐⭐
+    - Expected Accuracy: 75-85%
+    - Best for: Linear relationships, interpretability
+    - Pros: Fast, interpretable coefficients
+
+ 3. **Decision Tree** ⭐⭐⭐
+    - Expected Accuracy: 70-80%
+    - Best for: Rule-based decisions, interpretability
+    - Pros: Easy to understand and visualize
+
+ 4. **Support Vector Machine (SVM)** ⭐⭐⭐⭐
+    - Expected Accuracy: 80-90%
+    - Best for: High-dimensional data, small datasets
+    - Pros: Effective for complex patterns
+
+ 5. **K-Nearest Neighbors (KNN)** ⭐⭐⭐
+    - Expected Accuracy: 70-85%
+    - Best for: Simple patterns, small datasets
+    - Pros: Simple, no assumptions about data
+ """
+
+         # Regression models
+         if any("Regression" in suggestion[1] for suggestion in target_suggestions):
+             ml_analysis += "\n#### 📈 For Regression Tasks:\n"
+             ml_analysis += """
+ 1. **Random Forest Regressor** ⭐⭐⭐⭐⭐
+    - Expected R² Score: 0.80-0.95
+    - Best for: Non-linear relationships, feature importance
+    - Pros: Robust, handles outliers well
+
+ 2. **Linear Regression** ⭐⭐⭐⭐
+    - Expected R² Score: 0.70-0.85
+    - Best for: Linear relationships, interpretability
+    - Pros: Fast, interpretable, baseline model
+
+ 3. **Support Vector Regression (SVR)** ⭐⭐⭐⭐
+    - Expected R² Score: 0.75-0.90
+    - Best for: Non-linear patterns, robust predictions
+    - Pros: Effective for complex relationships
+
+ 4. **Decision Tree Regressor** ⭐⭐⭐
+    - Expected R² Score: 0.65-0.80
+    - Best for: Non-linear, interpretable rules
+    - Pros: Easy to understand decision path
+ """
+
+         # Data preprocessing recommendations
+         ml_analysis += "\n### 🛠️ Data Preprocessing Recommendations:\n"
+
+         missing_data = self.df.isnull().sum().sum()
+         if missing_data > 0:
+             ml_analysis += f"- **Handle Missing Data**: {missing_data} missing values need attention\n"
+
+         if len(categorical_cols) > 0:
+             ml_analysis += "- **Encode Categorical Variables**: Use Label Encoding or One-Hot Encoding\n"
+
+         if len(numeric_cols) > 1:
+             ml_analysis += "- **Feature Scaling**: Consider StandardScaler for SVM/KNN models\n"
+
+         outliers_detected = False
+         for col in numeric_cols:
+             Q1 = self.df[col].quantile(0.25)
+             Q3 = self.df[col].quantile(0.75)
+             IQR = Q3 - Q1
+             outliers = ((self.df[col] < (Q1 - 1.5 * IQR)) | (self.df[col] > (Q3 + 1.5 * IQR))).sum()
+             if outliers > len(self.df) * 0.05:  # More than 5% outliers
+                 outliers_detected = True
+                 break
+
+         if outliers_detected:
+             ml_analysis += "- **Handle Outliers**: Detected outliers that may affect model performance\n"
+
+         return ml_analysis
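The preprocessing advice `_get_ml_recommendations` emits (imputation, categorical encoding, scaling for SVM/KNN) maps directly onto scikit-learn, which the new app.py already imports. A minimal sketch of how a reader could act on those recommendations; `your_data.csv` and `target` are placeholders, not names used by the app:

```python
# Hedged sketch: applying the preprocessing steps recommended above.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

df = pd.read_csv("your_data.csv")                    # placeholder path
X, y = df.drop(columns=["target"]), df["target"]     # placeholder target column

numeric = X.select_dtypes(include="number").columns
categorical = X.select_dtypes(include="object").columns

pre = ColumnTransformer([
    # impute + scale numeric features (scaling matters for SVM/KNN)
    ("num", Pipeline([("impute", SimpleImputer(strategy="median")),
                      ("scale", StandardScaler())]), numeric),
    # impute + one-hot encode categorical features
    ("cat", Pipeline([("impute", SimpleImputer(strategy="most_frequent")),
                      ("onehot", OneHotEncoder(handle_unknown="ignore"))]), categorical),
])

model = Pipeline([("pre", pre), ("clf", RandomForestClassifier(random_state=42))])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
print(f"Held-out accuracy: {model.score(X_test, y_test):.3f}")
```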
+     def _get_visualization_recommendations(self):
+         """Provide specific chart recommendations for variables"""
+         if self.df is None:
+             return ""
+
+         viz_analysis = "\n\n## 📊 DATA VISUALIZATION RECOMMENDATIONS\n\n"
+
+         numeric_cols = self.df.select_dtypes(include=[np.number]).columns.tolist()
+         categorical_cols = self.df.select_dtypes(include=['object']).columns.tolist()
+
+         # Single variable visualizations
+         viz_analysis += "### 📈 Single Variable Analysis:\n\n"
+
+         for col in numeric_cols:
+             viz_analysis += f"**{col}** (Numeric):\n"
+             viz_analysis += f"- **Histogram**: Show distribution of {col}\n"
+             viz_analysis += f"- **Box Plot**: Identify outliers in {col}\n"
+             viz_analysis += f"- **Density Plot**: Smooth distribution curve for {col}\n\n"
+
+         for col in categorical_cols:
+             unique_count = self.df[col].nunique()
+             viz_analysis += f"**{col}** (Categorical - {unique_count} categories):\n"
+             if unique_count <= 10:
+                 viz_analysis += f"- **Bar Chart**: Count of each category in {col}\n"
+                 viz_analysis += f"- **Pie Chart**: Proportion of categories in {col}\n"
              else:
+                 viz_analysis += f"- **Bar Chart**: Top 10 categories in {col}\n"
+                 viz_analysis += f"- **Donut Chart**: Alternative to pie chart for {col}\n\n"
+
+         # Two variable relationships
+         if len(self.df.columns) > 1:
+             viz_analysis += "### 🔗 Two Variable Relationships:\n\n"
+
+             # Numeric vs Numeric
+             if len(numeric_cols) >= 2:
+                 viz_analysis += "**Numeric vs Numeric Combinations:**\n"
+                 for i in range(len(numeric_cols)):
+                     for j in range(i+1, len(numeric_cols)):
+                         col1, col2 = numeric_cols[i], numeric_cols[j]
+                         viz_analysis += f"- **Scatter Plot**: {col1} (X-axis) vs {col2} (Y-axis)\n"
+                         viz_analysis += f"- **Correlation Heatmap**: Relationship strength between {col1} and {col2}\n"
+                 viz_analysis += "\n"
+
+             # Categorical vs Numeric
+             if len(categorical_cols) > 0 and len(numeric_cols) > 0:
+                 viz_analysis += "**Categorical vs Numeric Combinations:**\n"
+                 for cat_col in categorical_cols:
+                     for num_col in numeric_cols:
+                         viz_analysis += f"- **Box Plot**: {cat_col} (X-axis) vs {num_col} (Y-axis)\n"
+                         viz_analysis += f"- **Violin Plot**: Distribution of {num_col} across {cat_col} categories\n"
+                         viz_analysis += f"- **Bar Plot**: Average {num_col} by {cat_col}\n"
+                 viz_analysis += "\n"
+
+             # Categorical vs Categorical
+             if len(categorical_cols) >= 2:
+                 viz_analysis += "**Categorical vs Categorical Combinations:**\n"
+                 for i in range(len(categorical_cols)):
+                     for j in range(i+1, len(categorical_cols)):
+                         col1, col2 = categorical_cols[i], categorical_cols[j]
+                         viz_analysis += f"- **Stacked Bar Chart**: {col1} (X-axis) stacked by {col2}\n"
+                         viz_analysis += f"- **Heatmap**: Cross-tabulation of {col1} vs {col2}\n"
+                         viz_analysis += f"- **Grouped Bar Chart**: {col1} grouped by {col2}\n"
+                 viz_analysis += "\n"
+
+         # Advanced visualizations
+         if len(self.df.columns) >= 3:
+             viz_analysis += "### 🎨 Advanced Multi-Variable Analysis:\n\n"
+
+             if len(numeric_cols) >= 3:
+                 viz_analysis += "**For 3+ Numeric Variables:**\n"
+                 viz_analysis += f"- **3D Scatter Plot**: {numeric_cols[0]} (X) vs {numeric_cols[1]} (Y) vs {numeric_cols[2]} (Z)\n"
+                 viz_analysis += f"- **Pair Plot**: All numeric variables against each other\n"
+                 viz_analysis += f"- **Correlation Matrix**: Heatmap of all numeric correlations\n\n"
+
+             if len(numeric_cols) >= 2 and len(categorical_cols) >= 1:
+                 viz_analysis += "**For Mixed Variable Types:**\n"
+                 viz_analysis += f"- **Scatter Plot with Color**: {numeric_cols[0]} vs {numeric_cols[1]} colored by {categorical_cols[0]}\n"
+                 viz_analysis += f"- **Bubble Chart**: {numeric_cols[0]} (X) vs {numeric_cols[1]} (Y) with bubble size from another variable\n\n"
+
+         # Dashboard recommendations
+         viz_analysis += "### 📋 Dashboard Layout Suggestions:\n\n"
+         viz_analysis += "**Top Row**: Overview metrics and key KPIs\n"
+         viz_analysis += "**Middle Section**: Main analysis charts (2-3 key visualizations)\n"
+         viz_analysis += "**Bottom Section**: Detailed breakdowns and filters\n"
+         viz_analysis += "**Side Panel**: Interactive filters and controls\n"
+
+         return viz_analysis
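`_get_visualization_recommendations` only suggests charts in prose. As a companion, here is a hedged sketch of one recurring suggestion, the correlation heatmap, using the seaborn and matplotlib imports already present in the new app.py; the file path is a placeholder:

```python
# Hedged sketch: the "Correlation Matrix" chart recommended above.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv("your_data.csv")                 # placeholder path
corr = df.select_dtypes(include="number").corr()  # numeric columns only

sns.heatmap(corr, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation Matrix")
plt.tight_layout()
plt.savefig("correlation_heatmap.png", dpi=150)
```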
+     def generate_business_insights(self, question):
+         """Generate business insights based on the question and dataset"""
+         if self.df is None:
+             return "Please upload a dataset first to generate insights."
+
+         insights = f"""
+ # 💡 BUSINESS INSIGHTS & RECOMMENDATIONS
+
+ ## Question: {question}
+
+ ## 📊 Data-Driven Analysis:
+ """
+
+         # Basic statistics
+         numeric_cols = self.df.select_dtypes(include=[np.number]).columns.tolist()
+         categorical_cols = self.df.select_dtypes(include=['object']).columns.tolist()
+
+         if len(numeric_cols) > 0:
+             insights += "\n### 📈 Key Metrics:\n"
+             for col in numeric_cols[:5]:  # Show top 5 numeric columns
+                 mean_val = self.df[col].mean()
+                 median_val = self.df[col].median()
+                 std_val = self.df[col].std()
+                 insights += f"- **{col}**: Mean = {mean_val:.2f}, Median = {median_val:.2f}, Std = {std_val:.2f}\n"
+
+         if len(categorical_cols) > 0:
+             insights += "\n### 📋 Category Distribution:\n"
+             for col in categorical_cols[:3]:  # Show top 3 categorical columns
+                 top_category = self.df[col].mode()[0]
+                 category_count = self.df[col].value_counts().iloc[0]
+                 total_count = len(self.df)
+                 percentage = (category_count / total_count) * 100
+                 insights += f"- **{col}**: Most common = '{top_category}' ({category_count}/{total_count} = {percentage:.1f}%)\n"
+
+         # Generate recommendations based on question keywords
+         question_lower = question.lower()
+
+         if any(word in question_lower for word in ['revenue', 'sales', 'profit', 'income']):
+             insights += "\n### 💰 Revenue/Sales Insights:\n"
+             insights += "- Focus on high-performing segments identified in the data\n"
+             insights += "- Analyze seasonal trends if time data is available\n"
+             insights += "- Consider customer segmentation based on purchase behavior\n"
+
+         elif any(word in question_lower for word in ['customer', 'client', 'user']):
+             insights += "\n### 👥 Customer Insights:\n"
+             insights += "- Segment customers based on key characteristics\n"
+             insights += "- Identify high-value customer profiles\n"
+             insights += "- Analyze customer retention patterns\n"
+
+         elif any(word in question_lower for word in ['marketing', 'campaign', 'advertising']):
+             insights += "\n### 📢 Marketing Insights:\n"
+             insights += "- Evaluate campaign performance metrics\n"
+             insights += "- Identify most effective channels\n"
+             insights += "- Optimize targeting based on demographic data\n"
+
+         elif any(word in question_lower for word in ['predict', 'forecast', 'future']):
+             insights += "\n### 🔮 Predictive Insights:\n"
+             insights += "- Use historical patterns for forecasting\n"
+             insights += "- Apply machine learning models for predictions\n"
+             insights += "- Consider external factors that might influence outcomes\n"
+
          else:
+             insights += "\n### 🎯 General Business Recommendations:\n"
+             insights += "- Identify key performance indicators from your data\n"
+             insights += "- Look for correlations between important variables\n"
+             insights += "- Consider segmentation strategies based on data patterns\n"
+
+         # Add data quality assessment
+         missing_data_pct = (self.df.isnull().sum().sum() / (self.df.shape[0] * self.df.shape[1])) * 100
+         insights += f"\n### ⚠️ Data Quality Notes:\n"
+         insights += f"- Missing data: {missing_data_pct:.1f}% of total data points\n"
+         insights += f"- Data completeness: {100-missing_data_pct:.1f}%\n"
+
+         if missing_data_pct > 10:
+             insights += "- **Recommendation**: Address missing data before making critical decisions\n"
+
+         return insights
+     def create_visualization(self, chart_type, x_column, y_column):
+         """Create visualizations based on user selection"""
+         if self.df is None:
+             return "Please upload a dataset first."
+
+         try:
+             plt.figure(figsize=(10, 6))
+             plt.style.use('default')
+
+             if chart_type == "Scatter Plot":
+                 plt.scatter(self.df[x_column], self.df[y_column], alpha=0.6)
+                 plt.xlabel(x_column)
+                 plt.ylabel(y_column)
+                 plt.title(f'Scatter Plot: {x_column} vs {y_column}')
+
+             elif chart_type == "Line Chart":
+                 plt.plot(self.df[x_column], self.df[y_column])
+                 plt.xlabel(x_column)
+                 plt.ylabel(y_column)
+                 plt.title(f'Line Chart: {x_column} vs {y_column}')
+
+             elif chart_type == "Bar Chart":
+                 if self.df[x_column].dtype == 'object':
+                     value_counts = self.df[x_column].value_counts().head(10)
+                     plt.bar(value_counts.index, value_counts.values)
+                     plt.xlabel(x_column)
+                     plt.ylabel('Count')
+                     plt.title(f'Bar Chart: {x_column}')
+                     plt.xticks(rotation=45)
+                 else:
+                     plt.bar(self.df[x_column], self.df[y_column])
+                     plt.xlabel(x_column)
+                     plt.ylabel(y_column)
+                     plt.title(f'Bar Chart: {x_column} vs {y_column}')
+
+             elif chart_type == "Histogram":
+                 plt.hist(self.df[x_column], bins=30, alpha=0.7)
+                 plt.xlabel(x_column)
+                 plt.ylabel('Frequency')
+                 plt.title(f'Histogram: {x_column}')
+
+             elif chart_type == "Box Plot":
+                 if y_column and self.df[y_column].dtype == 'object':
+                     self.df.boxplot(column=x_column, by=y_column)
+                     plt.title(f'Box Plot: {x_column} by {y_column}')
+                 else:
+                     plt.boxplot(self.df[x_column].dropna())
+                     plt.ylabel(x_column)
+                     plt.title(f'Box Plot: {x_column}')
+
+             plt.tight_layout()
+
+             # Render the figure to an in-memory PNG and return it as a PIL image,
+             # which gr.Image can display (raw PNG bytes would not render)
+             img_buffer = io.BytesIO()
+             plt.savefig(img_buffer, format='png', dpi=150, bbox_inches='tight')
+             img_buffer.seek(0)
+             plt.close()
+
+             return Image.open(img_buffer)
+
+         except Exception as e:
+             return f"Error creating visualization: {str(e)}"
+
+ # Initialize the Business Analyst GPT
+ analyst = BusinessAnalystGPT()
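Because `analyze_dataset` only reads the upload's `.name` attribute, the class can also be exercised outside Gradio with any object that exposes one. A hedged sketch; the CSV path is a placeholder:

```python
# Hedged sketch: driving BusinessAnalystGPT without the Gradio UI.
from types import SimpleNamespace

upload = SimpleNamespace(name="sales.csv")  # stand-in for Gradio's file object
report = analyst.analyze_dataset(upload)    # returns a markdown report string
print(report[:500])
```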
+ # Define the Gradio interface
+ def analyze_file(file):
+     return analyst.analyze_dataset(file)
+
+ def generate_insights(question):
+     return analyst.generate_business_insights(question)
+
+ def create_chart(chart_type, x_col, y_col):
+     # create_visualization returns a PIL image on success or an error string
+     return analyst.create_visualization(chart_type, x_col, y_col)
+
+ def get_columns():
+     if analyst.df is not None:
+         return gr.update(choices=list(analyst.df.columns)), gr.update(choices=list(analyst.df.columns))
+     return gr.update(choices=[]), gr.update(choices=[])
+
+ # Create the Gradio interface
+ with gr.Blocks(title="Business Analyst GPT", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+     # 🤖 Business Analyst GPT
+     ### Your AI-Powered Data Analysis Assistant
+
+     Upload your dataset and get comprehensive business insights, ML model recommendations, and visualization suggestions!
+     """)
+
+     with gr.Tab("📊 Dataset Analysis"):
+         with gr.Row():
+             file_input = gr.File(label="Upload your dataset (CSV or Excel)", file_types=[".csv", ".xlsx", ".xls"])
+             analyze_btn = gr.Button("🔍 Analyze Dataset", variant="primary")
+
+         analysis_output = gr.Markdown(label="Analysis Results")
+         analyze_btn.click(analyze_file, inputs=[file_input], outputs=[analysis_output])
+
+     with gr.Tab("💡 Business Insights"):
+         with gr.Row():
+             question_input = gr.Textbox(
+                 label="Ask a business question about your data",
+                 placeholder="e.g., How can I increase revenue? What are the key customer segments?",
+                 lines=2
+             )
+             insights_btn = gr.Button("💡 Generate Insights", variant="primary")
+
+         insights_output = gr.Markdown(label="Business Insights")
+         insights_btn.click(generate_insights, inputs=[question_input], outputs=[insights_output])
+
+     with gr.Tab("📈 Data Visualization"):
+         with gr.Row():
+             chart_type = gr.Dropdown(
+                 choices=["Scatter Plot", "Line Chart", "Bar Chart", "Histogram", "Box Plot"],
+                 label="Chart Type",
+                 value="Scatter Plot"
+             )
+             refresh_cols = gr.Button("🔄 Refresh Columns")
+
+         with gr.Row():
+             x_column = gr.Dropdown(choices=[], label="X-axis Column")
+             y_column = gr.Dropdown(choices=[], label="Y-axis Column (optional for some charts)")
+
+         create_viz_btn = gr.Button("📊 Create Visualization", variant="primary")
+         viz_output = gr.Image(label="Visualization")
+
+         refresh_cols.click(get_columns, outputs=[x_column, y_column])
+         create_viz_btn.click(create_chart, inputs=[chart_type, x_column, y_column], outputs=[viz_output])
+
+     with gr.Tab("ℹ️ How to Use"):
+         gr.Markdown("""
+         ## 🚀 How to Use Business Analyst GPT
+
+         ### Step 1: Upload Your Dataset
+         - Click on "Dataset Analysis" tab
+         - Upload a CSV or Excel file containing your business data
+         - Click "Analyze Dataset" to get comprehensive insights
+
+         ### Step 2: Get ML Model Recommendations
+         After uploading, you'll receive:
+         - **Target Variable Suggestions**: Which columns can be predicted
+         - **Feature Variable Identification**: Which columns to use as predictors
+         - **Model Recommendations**: Best ML algorithms for your data
+         - **Expected Performance**: Accuracy estimates for each model
+
+         ### Step 3: Get Specific Visualization Ideas
+         The analysis will provide:
+         - **Single Variable Charts**: Best charts for each column
+         - **Two Variable Relationships**: Specific X-axis and Y-axis recommendations
+         - **Advanced Visualizations**: Multi-variable analysis suggestions
+         - **Dashboard Layout**: How to organize your charts
+
+         ### Step 4: Generate Business Insights
+         - Ask specific business questions about your data
+         - Get data-driven recommendations and insights
+         - Receive actionable business strategies
+
+         ### Step 5: Create Visualizations
+         - Choose from various chart types
+         - Select specific columns for X and Y axes
+         - Generate publication-ready charts
+
+         ## 📋 Supported File Types
+         - CSV files (.csv)
+         - Excel files (.xlsx, .xls)
+
+         ## 🎯 Best Practices
+         1. **Clean Data**: Ensure your dataset has clear column headers
+         2. **Relevant Questions**: Ask specific business questions for better insights
+         3. **Column Selection**: Choose appropriate columns for visualizations
+         4. **Data Size**: Larger datasets provide more reliable ML recommendations
+         """)
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.launch()
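For completeness, the imports in the new app.py imply roughly the following requirements for the Space. This is an inferred sketch, not a file present in the commit:

```text
gradio
pandas
numpy
scikit-learn
matplotlib
seaborn
Pillow
```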