SunDou
/

dataset-builder

Model card Files Files and versions

xet

Community

SunDou committed 4 days ago

Commit

ec67b61

verified ·

1 Parent(s): f1c06ef

Upload data1/reporting/join_insights.py with huggingface_hub

Browse files

Files changed (1) hide show

data1/reporting/join_insights.py +458 -0

data1/reporting/join_insights.py ADDED Viewed

	@@ -0,0 +1,458 @@

+"""
+关联分析：将repo-level指标与repos_searched元信息join
+生成关联分析图和分组对比图
+"""
+import pandas as pd
+import numpy as np
+from pathlib import Path
+import json
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import matplotlib.font_manager as fm
+import seaborn as sns
+import time
+# Nature风格设置 - 使用字体回退机制（与visualization.py保持一致）
+font_families_to_try = ['Arial', 'DejaVu Sans', 'Liberation Sans', 'sans-serif']
+available_fonts = [f.name for f in fm.fontManager.ttflist]
+font_found = None
+for font_family in font_families_to_try:
+    font_lower = font_family.lower()
+    if any(f.lower() == font_lower for f in available_fonts):
+        font_found = font_family
+        break
+if font_found is None:
+    font_found = 'sans-serif'
+plt.rcParams['font.family'] = font_found
+plt.rcParams['font.size'] = 20
+plt.rcParams['axes.labelsize'] = 28  # Increased from 18
+plt.rcParams['axes.titlesize'] = 28  # Increased from 20
+plt.rcParams['xtick.labelsize'] = 24  # Increased from 15
+plt.rcParams['ytick.labelsize'] = 24  # Increased from 15
+plt.rcParams['legend.fontsize'] = 20  # Increased from 16
+plt.rcParams['figure.titlesize'] = 32  # Increased from 22
+plt.rcParams['axes.linewidth'] = 1.5
+plt.rcParams['axes.spines.top'] = False
+plt.rcParams['axes.spines.right'] = False
+plt.rcParams['axes.grid'] = True
+plt.rcParams['grid.alpha'] = 0.3
+plt.rcParams['grid.linewidth'] = 0.5
+# Nature配色
+NATURE_COLORS = {
+    'primary': '#2E5090',
+    'secondary': '#1A5490',
+    'accent': '#4A90E2',
+    'success': '#2E7D32',
+    'warning': '#F57C00',
+    'error': '#C62828',
+}
+def apply_nature_style(ax):
+    """应用Nature风格"""
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+    ax.spines['left'].set_linewidth(1.5)
+    ax.spines['bottom'].set_linewidth(1.5)
+    ax.grid(True, alpha=0.3, linestyle='--', linewidth=0.5)
+    ax.tick_params(width=1.5, length=5)
+class JoinInsights:
+    def __init__(self, repos_searched_csv, repo_level_csv, check_history_csv, output_dir):
+        self.repos_searched_csv = repos_searched_csv
+        self.repo_level_csv = repo_level_csv
+        self.check_history_csv = check_history_csv
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.df_joined = None
+    def load_and_join(self):
+        """加载数据并join"""
+        print("Loading data...")
+        # 读取repo-level统计
+        df_repo = pd.read_csv(self.repo_level_csv)
+        df_repo['full_name'] = df_repo['full_name'].fillna(
+            df_repo['repo_name'].str.replace('___', '/')
+        )
+        # 读取repos_searched（只读取需要的列以节省内存）
+        print("Loading repos_searched.csv...")
+        df_searched = pd.read_csv(
+            self.repos_searched_csv,
+            usecols=['full_name', 'keyword', 'stars', 'forks', 'open_issues',
+                    'created_at', 'pushed_at', 'language', 'license', 'archived'],
+            dtype={'stars': 'float64', 'forks': 'float64', 'open_issues': 'float64'}
+        )
+        # 读取check_history（获取is_relevant）
+        print("Loading repos_check_history.csv...")
+        df_history = pd.read_csv(
+            self.check_history_csv,
+            usecols=['full_name', 'keyword', 'is_relevant']
+        )
+        # Join: 先join check_history获取is_relevant，再join searched获取元信息
+        print("Joining data...")
+        df_joined = df_repo.merge(df_history, on='full_name', how='left')
+        df_joined = df_joined.merge(df_searched, on='full_name', how='left', suffixes=('', '_searched'))
+        # 处理重复列
+        if 'keyword_searched' in df_joined.columns:
+            df_joined['keyword'] = df_joined['keyword'].fillna(df_joined['keyword_searched'])
+        if 'language_searched' in df_joined.columns:
+            df_joined['language_searched'] = df_joined['language_searched'].fillna(df_joined.get('primary_language', ''))
+        # 清理
+        df_joined = df_joined.dropna(subset=['full_name'])
+        self.df_joined = df_joined
+        print(f"Joined data: {len(df_joined)} rows")
+        # 保存join后的数据
+        df_joined.to_csv(self.output_dir / 'joined_data.csv', index=False)
+        print(f"Saved joined data to {self.output_dir / 'joined_data.csv'}")
+    def analyze_correlations(self):
+        """分析关联性"""
+        if self.df_joined is None:
+            self.load_and_join()
+        df = self.df_joined.copy()
+        # 数值列相关性分析
+        numeric_cols = ['stars', 'forks', 'open_issues', 'total_code_lines',
+                       'total_tokens', 'total_functions', 'total_files',
+                       'comment_ratio', 'language_entropy']
+        numeric_cols = [c for c in numeric_cols if c in df.columns]
+        df_numeric = df[numeric_cols].dropna()
+        if len(df_numeric) > 0:
+            corr_matrix = df_numeric.corr()
+            # 保存相关性矩阵
+            corr_matrix.to_csv(self.output_dir / 'correlation_matrix.csv')
+            # 重点相关性
+            insights = {}
+            if 'stars' in df_numeric.columns and 'total_code_lines' in df_numeric.columns:
+                corr = df_numeric['stars'].corr(df_numeric['total_code_lines'])
+                insights['stars_vs_loc'] = float(corr)
+            if 'stars' in df_numeric.columns and 'total_functions' in df_numeric.columns:
+                corr = df_numeric['stars'].corr(df_numeric['total_functions'])
+                insights['stars_vs_functions'] = float(corr)
+            if 'stars' in df_numeric.columns and 'comment_ratio' in df_numeric.columns:
+                corr = df_numeric['stars'].corr(df_numeric['comment_ratio'])
+                insights['stars_vs_comment_ratio'] = float(corr)
+            with open(self.output_dir / 'correlation_insights.json', 'w', encoding='utf-8') as f:
+                json.dump(insights, f, indent=2)
+            print(f"Correlation insights saved")
+    def plot_stars_vs_metrics(self):
+        """绘制stars与多个指标的关系"""
+        if self.df_joined is None:
+            self.load_and_join()
+        df = self.df_joined.copy()
+        df = df[df['stars'].notna() & (df['stars'] > 0)]
+        if len(df) == 0:
+            print("No data for stars vs metrics plot")
+            return
+        fig, axes = plt.subplots(2, 2, figsize=(19.2, 10.8))
+        colors_list = [NATURE_COLORS['primary'], NATURE_COLORS['accent'],
+                      NATURE_COLORS['success'], NATURE_COLORS['secondary']]
+        # 1. stars vs total_code_lines
+        ax = axes[0, 0]
+        apply_nature_style(ax)
+        df_plot = df[df['total_code_lines'] > 0]
+        if len(df_plot) > 0:
+            ax.scatter(df_plot['total_code_lines'], df_plot['stars'],
+                      alpha=0.4, s=30, color=colors_list[0], edgecolors='white', linewidth=0.5)
+            ax.set_xscale('log')
+            ax.set_yscale('log')
+            ax.set_xlabel('Lines of Code (LOC, log scale)', fontsize=28, fontweight='bold')
+            ax.set_ylabel('Stars (log scale)', fontsize=28, fontweight='bold')
+            ax.set_title('Stars vs Lines of Code', fontsize=28, fontweight='bold')
+            corr = np.corrcoef(np.log10(df_plot['total_code_lines']),
+                              np.log10(df_plot['stars']))[0, 1]
+            ax.text(0.05, 0.95, f'r = {corr:.3f}', transform=ax.transAxes,
+                   fontsize=24, fontweight='bold', verticalalignment='top',
+                   bbox=dict(boxstyle='round', facecolor='white', alpha=0.8,
+                           edgecolor=NATURE_COLORS['primary'], linewidth=2))
+        # 2. stars vs total_functions
+        ax = axes[0, 1]
+        apply_nature_style(ax)
+        df_plot = df[df['total_functions'] > 0]
+        if len(df_plot) > 0:
+            ax.scatter(df_plot['total_functions'], df_plot['stars'],
+                      alpha=0.4, s=30, color=colors_list[1], edgecolors='white', linewidth=0.5)
+            ax.set_xscale('log')
+            ax.set_yscale('log')
+            ax.set_xlabel('Number of Functions (log scale)', fontsize=28, fontweight='bold')
+            ax.set_ylabel('Stars (log scale)', fontsize=28, fontweight='bold')
+            ax.set_title('Stars vs Number of Functions', fontsize=28, fontweight='bold')
+            corr = np.corrcoef(np.log10(df_plot['total_functions']),
+                              np.log10(df_plot['stars']))[0, 1]
+            ax.text(0.05, 0.95, f'r = {corr:.3f}', transform=ax.transAxes,
+                   fontsize=18, fontweight='bold', verticalalignment='top',
+                   bbox=dict(boxstyle='round', facecolor='white', alpha=0.8,
+                           edgecolor=NATURE_COLORS['accent'], linewidth=2))
+        # 3. stars vs comment_ratio
+        ax = axes[1, 0]
+        apply_nature_style(ax)
+        df_plot = df[df['comment_ratio'].notna() & (df['comment_ratio'] >= 0)]
+        if len(df_plot) > 0:
+            ax.scatter(df_plot['comment_ratio'], df_plot['stars'],
+                      alpha=0.4, s=30, color=colors_list[2], edgecolors='white', linewidth=0.5)
+            ax.set_yscale('log')
+            ax.set_xlabel('Comment Ratio', fontsize=28, fontweight='bold')
+            ax.set_ylabel('Stars (log scale)', fontsize=28, fontweight='bold')
+            ax.set_title('Stars vs Comment Ratio', fontsize=28, fontweight='bold')
+            corr = df_plot['comment_ratio'].corr(np.log10(df_plot['stars']))
+            ax.text(0.05, 0.95, f'r = {corr:.3f}', transform=ax.transAxes,
+                   fontsize=18, fontweight='bold', verticalalignment='top',
+                   bbox=dict(boxstyle='round', facecolor='white', alpha=0.8,
+                           edgecolor=NATURE_COLORS['success'], linewidth=2))
+        # 4. stars vs language_entropy
+        ax = axes[1, 1]
+        apply_nature_style(ax)
+        df_plot = df[df['language_entropy'].notna() & (df['language_entropy'] >= 0)]
+        if len(df_plot) > 0:
+            ax.scatter(df_plot['language_entropy'], df_plot['stars'],
+                      alpha=0.4, s=30, color=colors_list[3], edgecolors='white', linewidth=0.5)
+            ax.set_yscale('log')
+            ax.set_xlabel('Language Diversity (Entropy)', fontsize=28, fontweight='bold')
+            ax.set_ylabel('Stars (log scale)', fontsize=28, fontweight='bold')
+            ax.set_title('Stars vs Language Diversity', fontsize=28, fontweight='bold')
+            corr = df_plot['language_entropy'].corr(np.log10(df_plot['stars']))
+            ax.text(0.05, 0.95, f'r = {corr:.3f}', transform=ax.transAxes,
+                   fontsize=18, fontweight='bold', verticalalignment='top',
+                   bbox=dict(boxstyle='round', facecolor='white', alpha=0.8,
+                           edgecolor=NATURE_COLORS['secondary'], linewidth=2))
+        plt.suptitle('Correlation Analysis: Stars vs Code Metrics (Top 15K Repositories)',
+                    fontsize=32, fontweight='bold', y=0.995)
+        plt.tight_layout(rect=[0, 0, 1, 0.96])
+        fig_path = self.output_dir / 'fig_insights_stars_vs_metrics.png'
+        plt.savefig(fig_path, dpi=150, bbox_inches='tight', facecolor='white')
+        plt.close()
+        print(f"Saved: {fig_path}")
+    def plot_by_keyword_comparison(self):
+        """按keyword分组对比代码指标"""
+        if self.df_joined is None:
+            self.load_and_join()
+        df = self.df_joined.copy()
+        df = df[df['keyword'].notna()]
+        # Top keywords (increased to 15 for better comparison)
+        top_keywords = df['keyword'].value_counts().head(15).index
+        df = df[df['keyword'].isin(top_keywords)]
+        if len(df) == 0:
+            print("No data for keyword comparison")
+            return
+        fig, axes = plt.subplots(2, 2, figsize=(19.2, 10.8))
+        colors_list = [NATURE_COLORS['primary'], NATURE_COLORS['success'],
+                      NATURE_COLORS['warning'], NATURE_COLORS['secondary']]
+        # 1. 平均代码行数
+        ax = axes[0, 0]
+        apply_nature_style(ax)
+        stats = df.groupby('keyword')['total_code_lines'].mean().sort_values(ascending=False)
+        stats.plot(kind='bar', ax=ax, color=colors_list[0], alpha=0.85, edgecolor='white', linewidth=1.5)
+        ax.set_title('Average Lines of Code', fontsize=28, fontweight='bold')
+        ax.set_xlabel('')
+        ax.set_ylabel('Average LOC', fontsize=28)
+        ax.tick_params(axis='x', rotation=45, labelsize=24)  # Increased font size
+        ax.tick_params(axis='y', labelsize=24)
+        # 2. 平均注释率
+        ax = axes[0, 1]
+        apply_nature_style(ax)
+        stats = df.groupby('keyword')['comment_ratio'].mean().sort_values(ascending=False)
+        stats.plot(kind='bar', ax=ax, color=colors_list[1], alpha=0.85, edgecolor='white', linewidth=1.5)
+        ax.set_title('Average Comment Ratio', fontsize=28, fontweight='bold')
+        ax.set_xlabel('')
+        ax.set_ylabel('Comment Ratio', fontsize=28)
+        ax.tick_params(axis='x', rotation=45, labelsize=24)  # Increased font size
+        ax.tick_params(axis='y', labelsize=24)
+        # 3. 平均stars（如果有）
+        ax = axes[1, 0]
+        apply_nature_style(ax)
+        if 'stars' in df.columns:
+            stats = df.groupby('keyword')['stars'].mean().sort_values(ascending=False)
+            stats.plot(kind='bar', ax=ax, color=colors_list[2], alpha=0.85, edgecolor='white', linewidth=1.5)
+            ax.set_title('Average Stars', fontsize=28, fontweight='bold')
+            ax.set_xlabel('')
+            ax.set_ylabel('Average Stars', fontsize=28)
+            ax.tick_params(axis='x', rotation=45, labelsize=24)  # Increased font size
+            ax.tick_params(axis='y', labelsize=24)
+        # 4. 语言多样性
+        ax = axes[1, 1]
+        apply_nature_style(ax)
+        stats = df.groupby('keyword')['language_entropy'].mean().sort_values(ascending=False)
+        stats.plot(kind='bar', ax=ax, color=colors_list[3], alpha=0.85, edgecolor='white', linewidth=1.5)
+        ax.set_title('Average Language Diversity', fontsize=28, fontweight='bold')
+        ax.set_xlabel('')
+        ax.set_ylabel('Language Entropy', fontsize=28)
+        ax.tick_params(axis='x', rotation=45, labelsize=24)  # Increased font size
+        ax.tick_params(axis='y', labelsize=24)
+        plt.suptitle('Code Metrics Comparison by Keyword (Top 15K Repositories)',
+                    fontsize=32, fontweight='bold', y=0.995)
+        plt.tight_layout(rect=[0, 0, 1, 0.96])
+        fig_path = self.output_dir / 'fig_insights_by_keyword.png'
+        plt.savefig(fig_path, dpi=150, bbox_inches='tight', facecolor='white')
+        plt.close()
+        print(f"Saved: {fig_path}")
+    def plot_archived_vs_active(self):
+        """对比archived与active仓库的代码特征"""
+        if self.df_joined is None:
+            self.load_and_join()
+        df = self.df_joined.copy()
+        if 'archived' not in df.columns:
+            print("No archived column in data")
+            return
+        df['is_archived'] = df['archived'].fillna(False)
+        fig, axes = plt.subplots(2, 2, figsize=(19.2, 10.8))
+        # 1. 代码行数对比
+        ax = axes[0, 0]
+        apply_nature_style(ax)
+        df_plot = df[df['total_code_lines'] > 0]
+        if len(df_plot) > 0:
+            bp = df_plot.boxplot(column='total_code_lines', by='is_archived', ax=ax,
+                                widths=0.6, patch_artist=True,
+                                boxprops=dict(facecolor=NATURE_COLORS['primary'], alpha=0.7, linewidth=2),
+                                medianprops=dict(color='white', linewidth=3),
+                                whiskerprops=dict(linewidth=2),
+                                capprops=dict(linewidth=2))
+            ax.set_title('Lines of Code: Archived vs Active', fontsize=28, fontweight='bold')
+            ax.set_xlabel('')
+            ax.set_ylabel('Lines of Code', fontsize=28)
+            ax.set_yscale('log')
+            ax.set_xticklabels(['Active', 'Archived'], fontsize=24)
+            plt.setp(ax.xaxis.get_majorticklabels(), rotation=0)
+        # 2. 注释率对比
+        ax = axes[0, 1]
+        apply_nature_style(ax)
+        df_plot = df[df['comment_ratio'].notna()]
+        if len(df_plot) > 0:
+            bp = df_plot.boxplot(column='comment_ratio', by='is_archived', ax=ax,
+                                widths=0.6, patch_artist=True,
+                                boxprops=dict(facecolor=NATURE_COLORS['success'], alpha=0.7, linewidth=2),
+                                medianprops=dict(color='white', linewidth=3),
+                                whiskerprops=dict(linewidth=2),
+                                capprops=dict(linewidth=2))
+            ax.set_title('Comment Ratio: Archived vs Active', fontsize=28, fontweight='bold')
+            ax.set_xlabel('')
+            ax.set_ylabel('Comment Ratio', fontsize=28)
+            ax.set_xticklabels(['Active', 'Archived'], fontsize=24)
+            plt.setp(ax.xaxis.get_majorticklabels(), rotation=0)
+        # 3. 函数数对比
+        ax = axes[1, 0]
+        apply_nature_style(ax)
+        df_plot = df[df['total_functions'] > 0]
+        if len(df_plot) > 0:
+            bp = df_plot.boxplot(column='total_functions', by='is_archived', ax=ax,
+                                widths=0.6, patch_artist=True,
+                                boxprops=dict(facecolor=NATURE_COLORS['accent'], alpha=0.7, linewidth=2),
+                                medianprops=dict(color='white', linewidth=3),
+                                whiskerprops=dict(linewidth=2),
+                                capprops=dict(linewidth=2))
+            ax.set_title('Number of Functions: Archived vs Active', fontsize=28, fontweight='bold')
+            ax.set_xlabel('')
+            ax.set_ylabel('Number of Functions', fontsize=28)
+            ax.set_yscale('log')
+            ax.set_xticklabels(['Active', 'Archived'], fontsize=24)
+            plt.setp(ax.xaxis.get_majorticklabels(), rotation=0)
+        # 4. 文件数对比
+        ax = axes[1, 1]
+        apply_nature_style(ax)
+        df_plot = df[df['total_files'] > 0]
+        if len(df_plot) > 0:
+            bp = df_plot.boxplot(column='total_files', by='is_archived', ax=ax,
+                                widths=0.6, patch_artist=True,
+                                boxprops=dict(facecolor=NATURE_COLORS['secondary'], alpha=0.7, linewidth=2),
+                                medianprops=dict(color='white', linewidth=3),
+                                whiskerprops=dict(linewidth=2),
+                                capprops=dict(linewidth=2))
+            ax.set_title('Number of Files: Archived vs Active', fontsize=28, fontweight='bold')
+            ax.set_xlabel('')
+            ax.set_ylabel('Number of Files', fontsize=28)
+            ax.set_yscale('log')
+            ax.set_xticklabels(['Active', 'Archived'], fontsize=24)
+            plt.setp(ax.xaxis.get_majorticklabels(), rotation=0)
+        plt.suptitle('Code Characteristics Comparison: Archived vs Active (Top 15K Repositories)',
+                    fontsize=32, fontweight='bold', y=0.995)
+        plt.tight_layout(rect=[0, 0, 1, 0.96])
+        fig_path = self.output_dir / 'fig_insights_archived_vs_active.png'
+        plt.savefig(fig_path, dpi=150, bbox_inches='tight', facecolor='white')
+        plt.close()
+        print(f"Saved: {fig_path}")
+    def run(self):
+        """执行完整分析"""
+        print("=" * 80)
+        print("关联分析与洞察")
+        print("=" * 80)
+        self.load_and_join()
+        self.analyze_correlations()
+        self.plot_stars_vs_metrics()
+        self.plot_by_keyword_comparison()
+        self.plot_archived_vs_active()
+        print(f"\n关联分析完成！结果保存在: {self.output_dir}")
+if __name__ == "__main__":
+    repos_searched_csv = "/home/weifengsun/tangou1/domain_code/src/workdir/repos_searched.csv"
+    repo_level_csv = "/home/weifengsun/tangou1/domain_code/src/workdir/reporting/code_stats/repo_level_metrics_top15000.csv"
+    check_history_csv = "/home/weifengsun/tangou1/domain_code/src/workdir/repos_check_history.csv"
+    output_dir = "/home/weifengsun/tangou1/domain_code/src/workdir/reporting/insights"
+    insights = JoinInsights(repos_searched_csv, repo_level_csv, check_history_csv, output_dir)
+    insights.run()