SunDou committed on
Commit
ca20329
·
verified ·
1 Parent(s): b736b8f

Remove upload_code_to_hf.py

Browse files
Files changed (1) hide show
  1. upload_code_to_hf.py +0 -267
upload_code_to_hf.py DELETED
@@ -1,267 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Upload dataset_builder code repository to Hugging Face Hub.
4
-
5
- Usage:
6
- # Upload to your personal account:
7
- python upload_code_to_hf.py --hf_user YOUR_USERNAME
8
-
9
- # Upload to an organization:
10
- python upload_code_to_hf.py --hf_user YOUR_ORG
11
-
12
- # Custom repository name:
13
- python upload_code_to_hf.py --hf_user YOUR_USERNAME --repo_name my-dataset-builder
14
-
15
- # Upload to a model repository (default):
16
- python upload_code_to_hf.py --hf_user YOUR_USERNAME
17
-
18
- # Upload to a space repository:
19
- python upload_code_to_hf.py --hf_user YOUR_USERNAME --repo_type space
20
- """
21
-
22
- import os
23
- import argparse
24
- import logging
25
- from pathlib import Path
26
- from huggingface_hub import HfApi, create_repo
27
- from huggingface_hub.utils import HfHubHTTPError
28
- import time
29
-
30
- logging.basicConfig(
31
- level=logging.INFO,
32
- format="%(asctime)s [%(levelname)s] %(message)s",
33
- datefmt="%Y-%m-%d %H:%M:%S",
34
- )
35
- logger = logging.getLogger(__name__)
36
-
37
# Files/directories to exclude from upload.
# Entries are shell-style patterns matched case-sensitively against every
# component of a file's path relative to the upload root. A plain name such as
# "dist" only matches a component spelled exactly that way.
EXCLUDE_PATTERNS = {
    "__pycache__",
    "*.pyc",
    "*.pyo",
    "*.pyd",
    ".git",
    ".gitignore",
    ".DS_Store",
    "*.log",
    "*.swp",
    "*.swo",
    "*~",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    "*.egg-info",
    "dist",
    "build",
    ".venv",
    "venv",
    "env",
    ".env",
    "node_modules",
    ".idea",
    ".vscode",
    ".cursor",
}

# File names to always include, even if the name itself matches an exclude
# pattern or the hidden-file rule. The exemption covers the file name only:
# a file that lives inside an excluded directory stays excluded.
ALWAYS_INCLUDE = {
    ".gitignore",
    "README.md",
    "requirements.txt",
    "setup.py",
    "pyproject.toml",
}


def should_exclude(file_path: Path, root: Path) -> bool:
    """Check if a file should be excluded from upload.

    A path component is considered excluded when it starts with "." or
    matches one of the shell-style patterns in EXCLUDE_PATTERNS.

    Args:
        file_path: Path of the candidate file; must be located under ``root``.
        root: Directory the upload is rooted at.

    Returns:
        True if any parent directory component is excluded, or if the file
        name is excluded and is not listed in ALWAYS_INCLUDE; False otherwise.

    Raises:
        ValueError: If ``file_path`` is not relative to ``root`` (raised by
            ``Path.relative_to``).
    """
    # Local import keeps this block self-contained.
    from fnmatch import fnmatchcase

    def is_excluded_name(name: str) -> bool:
        # fnmatchcase (not fnmatch) so matching is case-sensitive on every OS.
        return name.startswith(".") or any(
            fnmatchcase(name, pattern) for pattern in EXCLUDE_PATTERNS
        )

    parts = file_path.relative_to(root).parts
    if not parts:
        # file_path is the root itself; there is nothing to judge.
        return False

    *parent_dirs, file_name = parts

    # Directories are checked first so that an ALWAYS_INCLUDE file name cannot
    # rescue files buried in node_modules, .git, virtualenvs, and the like.
    if any(is_excluded_name(directory) for directory in parent_dirs):
        return True

    if file_name in ALWAYS_INCLUDE:
        return False

    # The "*.pyc" / "*.pyo" / "*.pyd" / "*.log" patterns cover what used to be
    # separate suffix checks.
    return is_excluded_name(file_name)
100
-
101
-
102
def get_files_to_upload(root: Path) -> list[Path]:
    """Collect the files under ``root`` that pass the exclusion filter.

    Walks ``root`` recursively, keeps regular files for which
    ``should_exclude`` returns False, and returns them in sorted order.
    """
    regular_files = (entry for entry in root.rglob("*") if entry.is_file())
    return sorted(
        entry for entry in regular_files if not should_exclude(entry, root)
    )
109
-
110
-
111
def _http_status_of(error: Exception):
    """Return the HTTP status code attached to ``error``, or None if absent."""
    status = getattr(error, "status_code", None)
    if status is not None:
        return status
    # Never test the response by truthiness: a requests.Response evaluates to
    # False for 4xx/5xx statuses, which would hide the very 429 we look for.
    response = getattr(error, "response", None)
    return getattr(response, "status_code", None)


def _upload_with_retries(
    api, content: bytes, path_in_repo: str, repo_id: str, repo_type: str,
    max_attempts: int = 3,
) -> None:
    """Upload one file, retrying on failure; re-raise once attempts run out.

    Rate-limit responses (HTTP 429) back off exponentially (5s, 10s, ...,
    capped at 60s). Any other HfHubHTTPError is raised immediately. Other
    exceptions are retried with a linear back-off (2s, 4s, ...).
    """
    for attempt in range(1, max_attempts + 1):
        try:
            api.upload_file(
                path_or_fileobj=content,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type=repo_type,
            )
            return
        except HfHubHTTPError as e:
            # Raising on the last attempt makes a persistent 429 count as a
            # failure instead of silently dropping the file.
            if _http_status_of(e) != 429 or attempt == max_attempts:
                raise
            wait_time = min(5.0 * (2 ** (attempt - 1)), 60.0)
            logger.warning(
                f" Rate limited (429). Waiting {wait_time:.1f}s (attempt {attempt}/{max_attempts})..."
            )
        except Exception as e:
            if attempt == max_attempts:
                raise
            wait_time = 2.0 * attempt
            logger.warning(
                f" Error: {e}. Waiting {wait_time:.1f}s (attempt {attempt}/{max_attempts})..."
            )
        time.sleep(wait_time)


def upload_code_repo(
    api: HfApi,
    repo_id: str,
    code_dir: Path,
    repo_type: str = "model",
    delay_between_files: float = 1.0,
):
    """Upload code repository to Hugging Face Hub.

    Creates the repository if needed, then uploads every file returned by
    ``get_files_to_upload(code_dir)`` one at a time. A file that fails is
    logged and counted; the remaining files are still attempted.

    Args:
        api: Hub client used for both repository creation and uploads.
        repo_id: Target repository in ``owner/name`` form.
        code_dir: Local directory whose files are uploaded.
        repo_type: Hub repository type passed through to the API.
        delay_between_files: Seconds to sleep between consecutive files.

    Returns:
        None. Success and failure counts are reported through the logger.
    """
    logger.info(f"Uploading code from {code_dir} to {repo_id} (type: {repo_type})")

    # Create the repo through the same client that performs the uploads so
    # both use the same credentials and endpoint.
    # NOTE(review): huggingface_hub appears to require a `space_sdk` argument
    # when repo_type is "space" — confirm before relying on that repo type.
    api.create_repo(repo_id, repo_type=repo_type, exist_ok=True, private=False)
    logger.info(f"Repository {repo_id} created/verified.")

    files = get_files_to_upload(code_dir)
    total = len(files)
    logger.info(f"Found {total} files to upload.")

    if total == 0:
        logger.warning("No files to upload!")
        return

    successful = 0
    failed = 0

    for idx, file_path in enumerate(files, 1):
        # Repository paths always use forward slashes.
        path_in_repo = file_path.relative_to(code_dir).as_posix()

        try:
            # stat/read live inside the try so a file that disappeared after
            # listing is recorded as one failure rather than aborting the run.
            size_kb = file_path.stat().st_size / 1024
            logger.info(f"[{idx}/{total}] Uploading {path_in_repo} ({size_kb:.1f} KB)...")

            content = file_path.read_bytes()
            _upload_with_retries(api, content, path_in_repo, repo_id, repo_type)
        except Exception as e:
            failed += 1
            logger.error(f"[{idx}/{total}] ✗ Failed to upload {path_in_repo}: {e}")
        else:
            successful += 1
            logger.info(f"[{idx}/{total}] ✓ {path_in_repo} uploaded.")

        # Add delay between files (except after the last file)
        if idx < total:
            time.sleep(delay_between_files)

    logger.info(f"Upload complete: {successful} successful, {failed} failed out of {total} files.")
191
-
192
-
193
def main() -> int:
    """Parse command-line arguments and run the upload.

    Returns:
        Process exit code: 0 when the upload routine finished without raising,
        1 when the code directory is invalid, authentication could not be
        verified, or the upload raised an exception.
    """
    parser = argparse.ArgumentParser(
        description="Upload dataset_builder code repository to Hugging Face Hub",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--hf_user", type=str, required=True,
        help="Hugging Face username or organization name",
    )
    parser.add_argument(
        "--repo_name", type=str, default="dataset-builder",
        help="Repository name (default: dataset-builder)",
    )
    parser.add_argument(
        "--repo_type", type=str, default="model", choices=["model", "space"],
        help="Repository type (default: model)",
    )
    parser.add_argument(
        "--code_dir", type=str, default=None,
        # The fallback below is the script's own directory, not the CWD.
        help="Code directory to upload (default: directory containing this script)",
    )
    parser.add_argument(
        "--delay", type=float, default=1.0,
        help="Delay in seconds between file uploads (default: 1.0)",
    )
    args = parser.parse_args()

    # Determine code directory
    if args.code_dir:
        code_dir = Path(args.code_dir).resolve()
    else:
        code_dir = Path(__file__).parent.resolve()

    # is_dir() rather than exists(): a regular file is not a usable code directory.
    if not code_dir.is_dir():
        logger.error(f"Code directory not found: {code_dir}")
        return 1

    # Verify authentication
    api = HfApi()
    try:
        user_info = api.whoami()
        logger.info(f"Logged in as: {user_info.get('name', user_info.get('fullname', 'unknown'))}")
    except Exception as e:
        # Surface the underlying cause too: whoami() can also fail for
        # reasons other than a missing login (e.g. network problems).
        logger.error(f"Authentication check failed: {e}")
        logger.error(
            "Not logged in to Hugging Face. Please run:\n"
            " huggingface-cli login\n"
            "or set the HF_TOKEN environment variable."
        )
        return 1

    repo_id = f"{args.hf_user}/{args.repo_name}"

    logger.info("=" * 60)
    logger.info("Upload Plan:")
    logger.info(f" Code directory: {code_dir}")
    logger.info(f" Repository: {repo_id} (type: {args.repo_type})")
    logger.info("=" * 60)

    try:
        upload_code_repo(
            api=api,
            repo_id=repo_id,
            code_dir=code_dir,
            repo_type=args.repo_type,
            delay_between_files=args.delay,
        )
    except Exception as e:
        logger.error(f"✗ Code repository upload failed: {e}")
        return 1

    logger.info("✓ Code repository upload completed!")
    return 0


if __name__ == "__main__":
    # Propagate main()'s result as the process exit status.
    raise SystemExit(main())
267
-