SunDou committed on
Commit
ec67b61
·
verified ·
1 Parent(s): f1c06ef

Upload data1/reporting/join_insights.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data1/reporting/join_insights.py +458 -0
data1/reporting/join_insights.py ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 关联分析:将repo-level指标与repos_searched元信息join
3
+ 生成关联分析图和分组对比图
4
+ """
5
+ import pandas as pd
6
+ import numpy as np
7
+ from pathlib import Path
8
+ import json
9
+ import matplotlib
10
+ matplotlib.use('Agg')
11
+ import matplotlib.pyplot as plt
12
+ import matplotlib.font_manager as fm
13
+ import seaborn as sns
14
+ import time
15
+
16
# Nature-style plot settings. Fonts are resolved through a fallback chain
# (kept consistent with visualization.py): the first candidate that is actually
# installed wins, otherwise matplotlib's generic 'sans-serif' family is used.
font_families_to_try = ['Arial', 'DejaVu Sans', 'Liberation Sans', 'sans-serif']
available_fonts = [f.name for f in fm.fontManager.ttflist]

# Case-insensitive lookup table of the installed font names.
_installed_font_names = {name.lower() for name in available_fonts}
font_found = next(
    (
        candidate
        for candidate in font_families_to_try
        if candidate.lower() in _installed_font_names
    ),
    'sans-serif',
)

plt.rcParams.update({
    'font.family': font_found,
    'font.size': 20,
    'axes.labelsize': 28,      # Increased from 18
    'axes.titlesize': 28,      # Increased from 20
    'xtick.labelsize': 24,     # Increased from 15
    'ytick.labelsize': 24,     # Increased from 15
    'legend.fontsize': 20,     # Increased from 16
    'figure.titlesize': 32,    # Increased from 22
    'axes.linewidth': 1.5,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.grid': True,
    'grid.alpha': 0.3,
    'grid.linewidth': 0.5,
})
44
+
45
# Nature-style colour palette: semantic role name -> hex colour code.
NATURE_COLORS = dict(
    primary='#2E5090',
    secondary='#1A5490',
    accent='#4A90E2',
    success='#2E7D32',
    warning='#F57C00',
    error='#C62828',
)
54
+
55
def apply_nature_style(ax):
    """Apply the Nature-style look to a single axes object.

    Hides the top and right spines, thickens the left and bottom spines,
    enables a light dashed grid and sets the tick width/length.

    Args:
        ax: The axes to restyle in place.
    """
    for hidden_side in ('top', 'right'):
        ax.spines[hidden_side].set_visible(False)
    for visible_side in ('left', 'bottom'):
        ax.spines[visible_side].set_linewidth(1.5)

    ax.grid(True, alpha=0.3, linestyle='--', linewidth=0.5)
    ax.tick_params(width=1.5, length=5)
63
+
64
+
65
class JoinInsights:
    """Join repo-level code metrics with search metadata and derive insights.

    The repo-level metrics CSV is left-joined with the check-history CSV
    (``keyword`` / ``is_relevant``) and with the searched-repos CSV (stars,
    forks, ...). Correlation tables and three comparison figures are then
    written into ``output_dir``.

    Both auxiliary CSVs may hold several rows per repository (one per
    keyword), so the joined frame can contain the same repository more than
    once. Repo-level analyses therefore collapse it to one row per
    ``full_name``; the keyword comparison uses one row per
    ``(full_name, keyword)`` pair.
    """

    # Figure size in inches shared by all 2x2 figures.
    _FIGSIZE = (19.2, 10.8)
    # Font size of the in-panel "r = ..." annotations (identical in all panels).
    _ANNOTATION_FONTSIZE = 24

    def __init__(self, repos_searched_csv, repo_level_csv, check_history_csv, output_dir):
        """Store the input paths and create the output directory.

        Args:
            repos_searched_csv: Path to the searched-repos metadata CSV.
            repo_level_csv: Path to the repo-level metrics CSV.
            check_history_csv: Path to the relevance check-history CSV.
            output_dir: Directory that receives every generated file; it is
                created (including parents) if it does not exist.
        """
        self.repos_searched_csv = repos_searched_csv
        self.repo_level_csv = repo_level_csv
        self.check_history_csv = check_history_csv
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Populated by load_and_join(); None until then.
        self.df_joined = None

    # ------------------------------------------------------------------
    # Loading
    # ------------------------------------------------------------------
    @staticmethod
    def _resolve_full_name(df_repo):
        """Return the ``full_name`` column, derived from ``repo_name`` if needed.

        ``repo_name`` encodes ``owner/repo`` as ``owner___repo``. Missing
        ``full_name`` values (or a missing column) are filled from it.

        Raises:
            KeyError: If neither ``full_name`` nor ``repo_name`` is present.
        """
        has_full = 'full_name' in df_repo.columns
        has_repo = 'repo_name' in df_repo.columns
        if not has_full and not has_repo:
            raise KeyError("repo-level CSV needs a 'full_name' or 'repo_name' column")

        # Nothing to derive when full_name is already complete or there is
        # no repo_name to derive it from.
        if has_full and (not has_repo or df_repo['full_name'].notna().all()):
            return df_repo['full_name']

        derived = df_repo['repo_name'].str.replace('___', '/', regex=False)
        if not has_full:
            return derived
        return df_repo['full_name'].fillna(derived)

    def load_and_join(self):
        """Load the three CSV files, join them on ``full_name`` and save the result.

        The joined frame is stored in ``self.df_joined`` and written to
        ``joined_data.csv`` inside the output directory.
        """
        print("Loading data...")

        # Repo-level statistics.
        df_repo = pd.read_csv(self.repo_level_csv)
        df_repo['full_name'] = self._resolve_full_name(df_repo)

        # Searched repos: read only the needed columns to save memory.
        print("Loading repos_searched.csv...")
        df_searched = pd.read_csv(
            self.repos_searched_csv,
            usecols=['full_name', 'keyword', 'stars', 'forks', 'open_issues',
                     'created_at', 'pushed_at', 'language', 'license', 'archived'],
            dtype={'stars': 'float64', 'forks': 'float64', 'open_issues': 'float64'}
        )

        # Check history provides is_relevant.
        print("Loading repos_check_history.csv...")
        df_history = pd.read_csv(
            self.check_history_csv,
            usecols=['full_name', 'keyword', 'is_relevant']
        )

        # Join order: check history first (is_relevant), then search metadata.
        print("Joining data...")
        df_joined = df_repo.merge(df_history, on='full_name', how='left')
        df_joined = df_joined.merge(df_searched, on='full_name', how='left',
                                    suffixes=('', '_searched'))

        # Reconcile columns that exist on both sides of the second merge.
        if 'keyword_searched' in df_joined.columns:
            df_joined['keyword'] = df_joined['keyword'].fillna(df_joined['keyword_searched'])
        if 'language_searched' in df_joined.columns:
            df_joined['language_searched'] = df_joined['language_searched'].fillna(
                df_joined.get('primary_language', ''))

        # Rows without a repository identity cannot be analysed.
        df_joined = df_joined.dropna(subset=['full_name'])

        self.df_joined = df_joined
        print(f"Joined data: {len(df_joined)} rows")

        joined_path = self.output_dir / 'joined_data.csv'
        df_joined.to_csv(joined_path, index=False)
        print(f"Saved joined data to {joined_path}")

    def _repo_level_frame(self):
        """Return a copy of the joined data with exactly one row per repository.

        The joins can repeat a repository once per keyword combination;
        counting such a repository several times would bias repo-level
        statistics, so only its first row is kept.
        """
        return self.df_joined.drop_duplicates(subset='full_name').copy()

    # ------------------------------------------------------------------
    # Correlations
    # ------------------------------------------------------------------
    def analyze_correlations(self):
        """Compute the correlation matrix and the headline star correlations.

        Writes ``correlation_matrix.csv`` and ``correlation_insights.json``.
        Undefined correlations (for example against a constant column) are
        stored as JSON ``null`` so that the file remains valid JSON.
        """
        if self.df_joined is None:
            self.load_and_join()

        df = self._repo_level_frame()

        candidate_cols = ['stars', 'forks', 'open_issues', 'total_code_lines',
                          'total_tokens', 'total_functions', 'total_files',
                          'comment_ratio', 'language_entropy']
        numeric_cols = [c for c in candidate_cols if c in df.columns]

        # Only rows that are complete across every analysed column are used.
        df_numeric = df[numeric_cols].dropna()
        if len(df_numeric) == 0:
            print("No complete rows for correlation analysis")
            return

        corr_matrix = df_numeric.corr()
        corr_matrix.to_csv(self.output_dir / 'correlation_matrix.csv')

        # Headline correlations of stars against selected code metrics.
        headline_pairs = (
            ('stars_vs_loc', 'total_code_lines'),
            ('stars_vs_functions', 'total_functions'),
            ('stars_vs_comment_ratio', 'comment_ratio'),
        )
        insights = {}
        if 'stars' in corr_matrix.columns:
            for key, column in headline_pairs:
                if column not in corr_matrix.columns:
                    continue
                value = float(corr_matrix.at['stars', column])
                insights[key] = value if np.isfinite(value) else None

        insights_path = self.output_dir / 'correlation_insights.json'
        with open(insights_path, 'w', encoding='utf-8') as f:
            # allow_nan=False guarantees strictly valid JSON output.
            json.dump(insights, f, indent=2, allow_nan=False)

        print("Correlation insights saved")

    # ------------------------------------------------------------------
    # Figure 1: stars vs metrics
    # ------------------------------------------------------------------
    @classmethod
    def _draw_stars_scatter(cls, ax, df, column, color, xlabel, title, log_x):
        """Draw one "stars vs metric" scatter panel with its Pearson r label.

        Args:
            ax: Target axes.
            df: Repo-level frame already restricted to ``stars > 0``.
            column: Metric plotted on the x axis.
            color: Marker colour, also used for the annotation box edge.
            xlabel: X axis label.
            title: Panel title.
            log_x: If true, keep only strictly positive x values, use a log
                x axis and correlate log10(x); otherwise keep x >= 0 and
                correlate the raw values. Stars are always log-scaled.
        """
        apply_nature_style(ax)
        if column not in df.columns:
            return

        # Comparisons are False for NaN, so missing values drop out here too.
        valid = df[column] > 0 if log_x else df[column] >= 0
        subset = df[valid]
        if len(subset) == 0:
            return

        ax.scatter(subset[column], subset['stars'],
                   alpha=0.4, s=30, color=color, edgecolors='white', linewidth=0.5)
        if log_x:
            ax.set_xscale('log')
        ax.set_yscale('log')
        ax.set_xlabel(xlabel, fontsize=28, fontweight='bold')
        ax.set_ylabel('Stars (log scale)', fontsize=28, fontweight='bold')
        ax.set_title(title, fontsize=28, fontweight='bold')

        x_values = np.log10(subset[column]) if log_x else subset[column]
        corr = x_values.corr(np.log10(subset['stars']))
        ax.text(0.05, 0.95, f'r = {corr:.3f}', transform=ax.transAxes,
                fontsize=cls._ANNOTATION_FONTSIZE, fontweight='bold',
                verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8,
                          edgecolor=color, linewidth=2))

    def plot_stars_vs_metrics(self):
        """Plot stars against four code metrics as a 2x2 scatter figure.

        Saves ``fig_insights_stars_vs_metrics.png`` in the output directory.
        """
        if self.df_joined is None:
            self.load_and_join()

        df = self._repo_level_frame()
        if 'stars' in df.columns:
            df = df[df['stars'] > 0]  # also removes missing star counts
        else:
            df = df.iloc[0:0]

        if len(df) == 0:
            print("No data for stars vs metrics plot")
            return

        fig, axes = plt.subplots(2, 2, figsize=self._FIGSIZE)

        # (axes, column, colour, x label, title, log-scaled x?)
        panels = (
            (axes[0, 0], 'total_code_lines', NATURE_COLORS['primary'],
             'Lines of Code (LOC, log scale)', 'Stars vs Lines of Code', True),
            (axes[0, 1], 'total_functions', NATURE_COLORS['accent'],
             'Number of Functions (log scale)', 'Stars vs Number of Functions', True),
            (axes[1, 0], 'comment_ratio', NATURE_COLORS['success'],
             'Comment Ratio', 'Stars vs Comment Ratio', False),
            (axes[1, 1], 'language_entropy', NATURE_COLORS['secondary'],
             'Language Diversity (Entropy)', 'Stars vs Language Diversity', False),
        )
        for ax, column, color, xlabel, title, log_x in panels:
            self._draw_stars_scatter(ax, df, column, color, xlabel, title, log_x)

        fig.suptitle('Correlation Analysis: Stars vs Code Metrics (Top 15K Repositories)',
                     fontsize=32, fontweight='bold', y=0.995)
        fig.tight_layout(rect=[0, 0, 1, 0.96])

        fig_path = self.output_dir / 'fig_insights_stars_vs_metrics.png'
        fig.savefig(fig_path, dpi=150, bbox_inches='tight', facecolor='white')
        plt.close(fig)
        print(f"Saved: {fig_path}")

    # ------------------------------------------------------------------
    # Figure 2: comparison by keyword
    # ------------------------------------------------------------------
    @staticmethod
    def _draw_keyword_bars(ax, df, column, color, title, ylabel):
        """Draw the per-keyword mean of ``column`` as a descending bar chart."""
        apply_nature_style(ax)
        if column not in df.columns:
            return

        stats = df.groupby('keyword')[column].mean().sort_values(ascending=False)
        stats.plot(kind='bar', ax=ax, color=color, alpha=0.85,
                   edgecolor='white', linewidth=1.5)
        ax.set_title(title, fontsize=28, fontweight='bold')
        ax.set_xlabel('')
        ax.set_ylabel(ylabel, fontsize=28)
        ax.tick_params(axis='x', rotation=45, labelsize=24)
        ax.tick_params(axis='y', labelsize=24)

    def plot_by_keyword_comparison(self):
        """Compare mean code metrics across the 15 most frequent keywords.

        Saves ``fig_insights_by_keyword.png`` in the output directory.
        """
        if self.df_joined is None:
            self.load_and_join()

        df = self.df_joined
        if 'keyword' in df.columns:
            # One row per (repository, keyword) so that join fan-out does not
            # weight a repository more than once within a keyword group.
            df = df[df['keyword'].notna()].drop_duplicates(subset=['full_name', 'keyword'])
            top_keywords = df['keyword'].value_counts().head(15).index
            df = df[df['keyword'].isin(top_keywords)]
        else:
            df = df.iloc[0:0]

        if len(df) == 0:
            print("No data for keyword comparison")
            return

        fig, axes = plt.subplots(2, 2, figsize=self._FIGSIZE)

        # (axes, column, colour, title, y label)
        panels = (
            (axes[0, 0], 'total_code_lines', NATURE_COLORS['primary'],
             'Average Lines of Code', 'Average LOC'),
            (axes[0, 1], 'comment_ratio', NATURE_COLORS['success'],
             'Average Comment Ratio', 'Comment Ratio'),
            (axes[1, 0], 'stars', NATURE_COLORS['warning'],
             'Average Stars', 'Average Stars'),
            (axes[1, 1], 'language_entropy', NATURE_COLORS['secondary'],
             'Average Language Diversity', 'Language Entropy'),
        )
        for ax, column, color, title, ylabel in panels:
            self._draw_keyword_bars(ax, df, column, color, title, ylabel)

        fig.suptitle('Code Metrics Comparison by Keyword (Top 15K Repositories)',
                     fontsize=32, fontweight='bold', y=0.995)
        fig.tight_layout(rect=[0, 0, 1, 0.96])

        fig_path = self.output_dir / 'fig_insights_by_keyword.png'
        fig.savefig(fig_path, dpi=150, bbox_inches='tight', facecolor='white')
        plt.close(fig)
        print(f"Saved: {fig_path}")

    # ------------------------------------------------------------------
    # Figure 3: archived vs active
    # ------------------------------------------------------------------
    @staticmethod
    def _as_archived_flag(value):
        """Normalise one raw ``archived`` cell to a plain bool.

        Missing values count as not archived; strings are compared
        case-insensitively against common truthy spellings.
        """
        if isinstance(value, str):
            return value.strip().lower() in ('true', '1', 'yes')
        if pd.isna(value):
            return False
        return bool(value)

    @staticmethod
    def _draw_archived_box(ax, df, column, color, title, ylabel, positive_only, log_y):
        """Draw one boxplot panel of ``column`` grouped by ``is_archived``.

        Args:
            ax: Target axes.
            df: Repo-level frame containing a boolean ``is_archived`` column.
            column: Metric shown on the y axis.
            color: Box fill colour.
            title: Panel title.
            ylabel: Y axis label.
            positive_only: Keep only strictly positive values when true,
                otherwise only drop missing values.
            log_y: Use a logarithmic y axis.
        """
        apply_nature_style(ax)
        if column not in df.columns:
            return

        valid = df[column] > 0 if positive_only else df[column].notna()
        subset = df[valid]
        if len(subset) == 0:
            return

        subset.boxplot(column=column, by='is_archived', ax=ax,
                       widths=0.6, patch_artist=True,
                       boxprops=dict(facecolor=color, alpha=0.7, linewidth=2),
                       medianprops=dict(color='white', linewidth=3),
                       whiskerprops=dict(linewidth=2),
                       capprops=dict(linewidth=2))
        ax.set_title(title, fontsize=28, fontweight='bold')
        ax.set_xlabel('')
        ax.set_ylabel(ylabel, fontsize=28)
        if log_y:
            ax.set_yscale('log')

        # Label only the groups actually present: a fixed two-item list does
        # not match the tick count when every repository is in one group.
        present_groups = sorted(subset['is_archived'].unique())
        tick_labels = ['Archived' if flag else 'Active' for flag in present_groups]
        ax.set_xticklabels(tick_labels, fontsize=24)
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=0)

    def plot_archived_vs_active(self):
        """Compare code characteristics of archived and active repositories.

        Saves ``fig_insights_archived_vs_active.png`` in the output directory.
        """
        if self.df_joined is None:
            self.load_and_join()

        df = self._repo_level_frame()

        if 'archived' not in df.columns:
            print("No archived column in data")
            return

        df['is_archived'] = df['archived'].map(self._as_archived_flag)

        fig, axes = plt.subplots(2, 2, figsize=self._FIGSIZE)

        # (axes, column, colour, title, y label, positive-only?, log y?)
        panels = (
            (axes[0, 0], 'total_code_lines', NATURE_COLORS['primary'],
             'Lines of Code: Archived vs Active', 'Lines of Code', True, True),
            (axes[0, 1], 'comment_ratio', NATURE_COLORS['success'],
             'Comment Ratio: Archived vs Active', 'Comment Ratio', False, False),
            (axes[1, 0], 'total_functions', NATURE_COLORS['accent'],
             'Number of Functions: Archived vs Active', 'Number of Functions', True, True),
            (axes[1, 1], 'total_files', NATURE_COLORS['secondary'],
             'Number of Files: Archived vs Active', 'Number of Files', True, True),
        )
        for ax, column, color, title, ylabel, positive_only, log_y in panels:
            self._draw_archived_box(ax, df, column, color, title, ylabel,
                                    positive_only, log_y)

        # Replaces the automatic "Boxplot grouped by ..." figure title.
        fig.suptitle('Code Characteristics Comparison: Archived vs Active (Top 15K Repositories)',
                     fontsize=32, fontweight='bold', y=0.995)
        fig.tight_layout(rect=[0, 0, 1, 0.96])

        fig_path = self.output_dir / 'fig_insights_archived_vs_active.png'
        fig.savefig(fig_path, dpi=150, bbox_inches='tight', facecolor='white')
        plt.close(fig)
        print(f"Saved: {fig_path}")

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------
    def run(self):
        """Run the full pipeline: load, join, correlate and plot."""
        print("=" * 80)
        print("关联分析与洞察")
        print("=" * 80)

        self.load_and_join()
        self.analyze_correlations()
        self.plot_stars_vs_metrics()
        self.plot_by_keyword_comparison()
        self.plot_archived_vs_active()

        print(f"\n关联分析完成!结果保存在: {self.output_dir}")
448
+
449
+
450
if __name__ == "__main__":
    # Command-line entry point. Every path can be overridden with an option;
    # the defaults are the original hard-coded locations, so running the
    # script without arguments behaves as before.
    import argparse

    parser = argparse.ArgumentParser(
        description="Join repo-level metrics with search metadata and plot insights."
    )
    parser.add_argument(
        "--repos-searched-csv",
        default="/home/weifengsun/tangou1/domain_code/src/workdir/repos_searched.csv",
        help="CSV with the searched-repository metadata.",
    )
    parser.add_argument(
        "--repo-level-csv",
        default="/home/weifengsun/tangou1/domain_code/src/workdir/reporting/code_stats/repo_level_metrics_top15000.csv",
        help="CSV with the repo-level code metrics.",
    )
    parser.add_argument(
        "--check-history-csv",
        default="/home/weifengsun/tangou1/domain_code/src/workdir/repos_check_history.csv",
        help="CSV with the relevance check history.",
    )
    parser.add_argument(
        "--output-dir",
        default="/home/weifengsun/tangou1/domain_code/src/workdir/reporting/insights",
        help="Directory that receives the joined data, tables and figures.",
    )
    args = parser.parse_args()

    insights = JoinInsights(
        args.repos_searched_csv,
        args.repo_level_csv,
        args.check_history_csv,
        args.output_dir,
    )
    insights.run()
458
+