SunDou commited on
Commit
c1ef1a3
·
verified ·
1 Parent(s): d50199f

Upload data1/reporting/repo_meta_scan.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data1/reporting/repo_meta_scan.py +193 -0
data1/reporting/repo_meta_scan.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
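+ Scan the repos_filtered/ directory and build a meta-profile of each repository:
3
+ file counts, sizes, extension distribution, engineering signals, etc.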
4
+ """
5
+ import os
6
+ from pathlib import Path
7
+ from collections import Counter, defaultdict
8
+ from tqdm import tqdm
9
+ import json
10
+ import statistics
11
+
12
+
13
class RepoMetaScan:
    """Scan a directory of checked-out repositories and report file metadata.

    For every repository directory the scanner records the file count, total
    and maximum file size, the file-extension histogram, notebook counts and a
    set of "engineering signals" (well-known files and directories such as
    ``Dockerfile`` or ``tests``). Results are accumulated in ``self.stats``
    and written out by :meth:`save_results`.
    """

    # Directories that are never descended into while walking a repository.
    _SKIP_DIRS = frozenset({
        '.git', 'node_modules', 'vendor', 'dist', 'build',
        '__pycache__', '.pytest_cache',
    })

    def __init__(self, repos_dir, output_dir, top_n=None):
        """Configure the scanner and create ``output_dir`` if it is missing.

        Args:
            repos_dir: Directory whose immediate sub-directories are the
                repositories to scan.
            output_dir: Directory that receives the CSV/JSON reports.
            top_n: If not ``None``, only the first ``top_n`` repositories
                (in sorted order) are scanned.
        """
        self.repos_dir = Path(repos_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.top_n = top_n

        # File and directory names treated as engineering signals.
        self.eng_signals = {
            'files': [
                'Dockerfile', 'docker-compose.yml', 'requirements.txt', 'setup.py',
                'pyproject.toml', 'package.json', 'pom.xml', 'Makefile', 'LICENSE',
                'CITATION.cff', 'CMakeLists.txt', 'Cargo.toml', 'go.mod', 'go.sum'
            ],
            'dirs': [
                '.github/workflows', 'tests', 'test', 'docs', 'doc', 'examples',
                'example', 'data', 'notebooks', 'notebook', 'scripts', 'script'
            ]
        }

        self.stats = []

    def get_repo_full_name(self, dir_name):
        """Convert a directory name to a full name (``owner___repo`` -> ``owner/repo``).

        Only the first ``___`` is treated as the owner/repo separator so that a
        repository name which itself contains ``___`` is left intact.
        NOTE(review): this assumes the owner part never contains ``___`` --
        confirm against the code that creates these directories.
        """
        return dir_name.replace('___', '/', 1)

    def scan_repo(self, repo_path):
        """Scan a single repository directory.

        Args:
            repo_path: Path of the repository root.

        Returns:
            A dict with the keys ``repo_name``, ``full_name``, ``total_files``,
            ``total_size_bytes``, ``max_file_size_bytes``, ``extensions``
            (extension -> count), ``has_ipynb``, ``ipynb_count``,
            ``eng_signals`` (signal -> ``True`` for every signal found) and
            ``has_git``.
        """
        repo_path = Path(repo_path)
        repo_name = repo_path.name

        extensions = Counter()
        found_signals = {}
        stats = {
            'repo_name': repo_name,
            'full_name': self.get_repo_full_name(repo_name),
            'total_files': 0,
            'total_size_bytes': 0,
            'max_file_size_bytes': 0,
            'extensions': extensions,
            'has_ipynb': False,
            'ipynb_count': 0,
            'eng_signals': found_signals,
            'has_git': (repo_path / '.git').is_dir(),
        }

        # Pre-split directory signals into path components so that matching is
        # done on whole directory names ('test' must not match 'latest') and is
        # independent of the platform's path separator.
        dir_signals = [
            (signal, tuple(signal.split('/')))
            for signal in self.eng_signals['dirs']
        ]
        file_signals = frozenset(self.eng_signals['files'])

        for root, dirs, files in os.walk(repo_path):
            # Prune in place so os.walk does not descend into skipped dirs.
            dirs[:] = [d for d in dirs if d not in self._SKIP_DIRS]

            # os.walk visits every directory it descends through, so checking
            # whether the current path *ends with* the signal's components is
            # enough to detect the signal at any depth.
            rel_parts = Path(root).relative_to(repo_path).parts
            for signal, wanted in dir_signals:
                if rel_parts[-len(wanted):] == wanted:
                    found_signals[signal] = True

            for file_name in files:
                file_path = Path(root) / file_name

                # Files that cannot be stat'ed (e.g. broken symlinks) are
                # skipped entirely and not counted.
                try:
                    file_size = file_path.stat().st_size
                except OSError:
                    continue

                stats['total_size_bytes'] += file_size
                if file_size > stats['max_file_size_bytes']:
                    stats['max_file_size_bytes'] = file_size
                stats['total_files'] += 1

                ext = file_path.suffix.lower() or '<noext>'
                extensions[ext] += 1

                if ext == '.ipynb':
                    stats['has_ipynb'] = True
                    stats['ipynb_count'] += 1

                if file_name in file_signals:
                    found_signals[file_name] = True

        # Plain dict so the result serialises cleanly.
        stats['extensions'] = dict(extensions)

        return stats

    def scan_all_repos(self):
        """Scan every repository under ``repos_dir`` and append to ``self.stats``.

        Repositories are taken in sorted order; when ``top_n`` is not ``None``
        only the first ``top_n`` are scanned. A repository whose scan raises is
        reported on stdout and skipped.
        """
        print(f"Scanning repos in {self.repos_dir}...")

        all_repos = sorted(d for d in self.repos_dir.iterdir() if d.is_dir())
        if self.top_n is None:
            selected_repos = all_repos
            print(f"Selected {len(selected_repos)} repos (all repos)")
        else:
            selected_repos = all_repos[:self.top_n]
            print(f"Selected {len(selected_repos)} repos (top {self.top_n} by alphabetical order)")

        for repo_path in tqdm(selected_repos, desc="Scanning repos"):
            # Top-level boundary: one bad repository must not abort the run.
            try:
                self.stats.append(self.scan_repo(repo_path))
            except Exception as e:
                print(f"Error scanning {repo_path}: {e}")

    def save_results(self):
        """Write the per-repository CSV, the summary JSON and the two summary CSVs.

        Files written into ``output_dir``:
            * ``repo_meta_scan[_top<N>].csv`` -- one row per scanned repository
            * ``repo_meta_summary.json`` -- aggregate totals, top 30 extensions
              and engineering-signal counts
            * ``extension_distribution.csv`` -- top 50 extensions
            * ``engineering_signals.csv`` -- signal counts and percentages
        """
        import pandas as pd

        # Per-repository statistics; the file name reflects the top_n limit.
        # 'is not None' mirrors the check used in scan_all_repos.
        top_n_suffix = f"_top{self.top_n}" if self.top_n is not None else ""
        pd.DataFrame(self.stats).to_csv(
            self.output_dir / f'repo_meta_scan{top_n_suffix}.csv', index=False
        )

        file_counts = [s['total_files'] for s in self.stats]
        sizes = [s['total_size_bytes'] for s in self.stats]
        summary = {
            'total_repos': len(self.stats),
            'total_files': sum(file_counts),
            'total_size_gb': sum(sizes) / (1024**3),
            'avg_files_per_repo': statistics.mean(file_counts) if file_counts else 0,
            'avg_size_mb_per_repo': statistics.mean(sizes) / (1024**2) if sizes else 0,
            'repos_with_ipynb': sum(1 for s in self.stats if s['has_ipynb']),
            'total_ipynb_files': sum(s['ipynb_count'] for s in self.stats),
        }

        # Extension histogram across all repositories.
        all_extensions = Counter()
        for s in self.stats:
            all_extensions.update(s['extensions'])
        summary['top_extensions'] = dict(all_extensions.most_common(30))

        # Number of repositories in which each engineering signal is present.
        eng_counts = Counter(
            signal
            for s in self.stats
            for signal, present in s['eng_signals'].items()
            if present
        )
        summary['engineering_signals'] = dict(eng_counts)

        with open(self.output_dir / 'repo_meta_summary.json', 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)

        # Explicit columns keep the header row even when there is no data.
        ext_df = pd.DataFrame(
            [
                {'extension': ext, 'count': count}
                for ext, count in all_extensions.most_common(50)
            ],
            columns=['extension', 'count'],
        )
        ext_df.to_csv(self.output_dir / 'extension_distribution.csv', index=False)

        # eng_counts is empty whenever self.stats is empty, so the division
        # below is never evaluated with a zero denominator.
        eng_df = pd.DataFrame(
            [
                {'signal': signal, 'count': count, 'percentage': count / len(self.stats) * 100}
                for signal, count in eng_counts.most_common()
            ],
            columns=['signal', 'count', 'percentage'],
        )
        eng_df.to_csv(self.output_dir / 'engineering_signals.csv', index=False)

    def run(self):
        """Run the full pipeline: scan all repositories, then save the results."""
        print("Scanning repository metadata...")
        self.scan_all_repos()
        print("Saving results...")
        self.save_results()
        print(f"Repo meta scan complete! Results saved to {self.output_dir}")
186
+
187
+
188
if __name__ == "__main__":
    # Script entry point: scan the hard-coded working directory and write the
    # reports next to it. top_n=None means every repository is scanned.
    RepoMetaScan(
        "/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered",
        "/home/weifengsun/tangou1/domain_code/src/workdir/reporting/repo_meta",
        top_n=None,
    ).run()
193
+