himu1780 committed on
Commit
9e0bbe3
·
verified ·
1 Parent(s): 924a225

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -65
app.py CHANGED
@@ -1,9 +1,6 @@
1
  """
2
- AI Python Code Model Trainer
3
- Hugging Face Space for continuous training with auto-resume
4
- Username: himu1780 | Model: ai-python-model
5
-
6
- FINAL VERSION - All optimizations applied
7
  """
8
 
9
  import os
@@ -12,7 +9,7 @@ import gradio as gr
12
  import threading
13
  import time
14
  from datetime import datetime
15
- from huggingface_hub import HfApi, login
16
  from transformers import (
17
  AutoModelForCausalLM,
18
  AutoTokenizer,
@@ -22,7 +19,6 @@ from transformers import (
22
  )
23
  from datasets import load_dataset, Dataset
24
 
25
- # Try to import torch for memory cleanup
26
  try:
27
  import torch
28
  TORCH_AVAILABLE = True
@@ -35,7 +31,6 @@ MODEL_REPO = f"{HF_USERNAME}/ai-python-model"
35
  DATASET_NAME = "jtatman/python-code-dataset-500k"
36
  BASE_MODEL = "gpt2"
37
 
38
- # Training hyperparameters (Memory optimized)
39
  BATCH_SIZE = 1
40
  GRADIENT_ACCUMULATION = 8
41
  SAVE_STEPS = 500
@@ -45,9 +40,8 @@ LEARNING_RATE = 5e-5
45
  MAX_STEPS_PER_SESSION = 10000
46
  EXAMPLES_PER_SESSION = 50000
47
 
48
- # Continuous training settings
49
- CONTINUOUS_TRAINING = True # Set False to stop after one session
50
- WAIT_BETWEEN_SESSIONS = 60 # Seconds to wait before next session
51
 
52
  # ============ GLOBAL STATE ============
53
  training_status = {
@@ -58,13 +52,13 @@ training_status = {
58
  "start_time": None,
59
  "message": "Initializing...",
60
  "session_count": 0,
 
61
  }
62
 
63
  stop_requested = False
64
 
65
  # ============ MEMORY CLEANUP ============
66
  def cleanup_memory():
67
- """Free up memory after training"""
68
  gc.collect()
69
  if TORCH_AVAILABLE and torch.cuda.is_available():
70
  torch.cuda.empty_cache()
@@ -72,19 +66,26 @@ def cleanup_memory():
72
 
73
  # ============ AUTHENTICATION ============
74
  def authenticate():
75
- """Login to Hugging Face Hub"""
76
  token = os.environ.get("HF_TOKEN")
77
  if token:
78
  login(token=token)
79
  training_status["message"] = "✅ Authenticated with Hugging Face"
 
 
 
 
 
 
 
 
80
  return True
81
  else:
82
  training_status["message"] = "❌ HF_TOKEN not found in secrets!"
 
83
  return False
84
 
85
  # ============ MODEL LOADING ============
86
  def load_model_and_tokenizer():
87
- """Load model from Hub (resume) or start fresh from base model"""
88
  global training_status
89
 
90
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
@@ -92,7 +93,7 @@ def load_model_and_tokenizer():
92
 
93
  try:
94
  training_status["message"] = f"🔄 Attempting to resume from {MODEL_REPO}..."
95
- model = AutoModelForCausalLM.from_pretrained(MODEL_REPO)
96
  training_status["message"] = f"✅ Resumed from {MODEL_REPO}"
97
  print(f"[INFO] Resumed training from {MODEL_REPO}")
98
  except Exception as e:
@@ -104,68 +105,92 @@ def load_model_and_tokenizer():
104
 
105
  # ============ DATASET PROCESSING ============
106
  def prepare_dataset(tokenizer):
107
- """Load and prepare dataset"""
108
  global training_status
109
- training_status["message"] = "📥 Loading dataset (streaming mode)..."
110
 
111
  try:
 
112
  dataset = load_dataset(DATASET_NAME, split="train", streaming=True)
 
 
113
  dataset = dataset.take(EXAMPLES_PER_SESSION)
114
 
115
- def tokenize_function(examples):
116
- texts = []
117
- instructions = examples.get("instruction", [])
118
- outputs = examples.get("output", [])
119
-
120
- for instruction, output in zip(instructions, outputs):
121
- if instruction and output:
122
- text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
123
- texts.append(text)
124
-
125
- if not texts:
126
- texts = [""]
127
-
128
- result = tokenizer(
129
- texts,
130
- truncation=True,
131
- max_length=MAX_LENGTH,
132
- padding="max_length",
133
- return_tensors=None,
134
- )
135
- result["labels"] = result["input_ids"].copy()
136
- return result
137
 
138
- tokenized_dataset = dataset.map(
139
- tokenize_function,
140
- batched=True,
141
- batch_size=100,
142
- remove_columns=["instruction", "output"],
143
- )
144
 
145
- training_status["message"] = "🔄 Converting dataset for Trainer..."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
- all_examples = []
148
- for i, example in enumerate(tokenized_dataset):
149
- all_examples.append(example)
150
- if i % 5000 == 0:
151
- training_status["message"] = f"📥 Loaded {i:,}/{EXAMPLES_PER_SESSION:,} examples..."
152
- if i >= EXAMPLES_PER_SESSION - 1:
153
- break
154
 
 
155
  train_dataset = Dataset.from_list(all_examples)
156
 
157
  training_status["message"] = f"✅ Dataset ready: {len(train_dataset):,} examples"
 
158
  return train_dataset
159
 
160
  except Exception as e:
161
  training_status["message"] = f"❌ Dataset error: {str(e)}"
 
162
  print(f"[ERROR] Dataset preparation failed: {e}")
 
 
163
  raise e
164
 
165
  # ============ CUSTOM TRAINER ============
166
  class StatusTrainer(Trainer):
167
- """Custom trainer with status updates and stop support"""
168
-
169
  def training_step(self, model, inputs):
170
  global stop_requested
171
  if stop_requested:
@@ -184,7 +209,6 @@ class StatusTrainer(Trainer):
184
 
185
  # ============ SINGLE TRAINING SESSION ============
186
  def run_training_session():
187
- """Run a single training session"""
188
  global training_status, stop_requested
189
 
190
  model = None
@@ -197,6 +221,10 @@ def run_training_session():
197
  model, tokenizer = load_model_and_tokenizer()
198
  train_dataset = prepare_dataset(tokenizer)
199
 
 
 
 
 
200
  data_collator = DataCollatorForLanguageModeling(
201
  tokenizer=tokenizer,
202
  mlm=False,
@@ -220,7 +248,7 @@ def run_training_session():
220
  max_steps=MAX_STEPS_PER_SESSION,
221
  fp16=False,
222
  dataloader_num_workers=0,
223
- remove_unused_columns=False,
224
  )
225
 
226
  trainer = StatusTrainer(
@@ -228,11 +256,14 @@ def run_training_session():
228
  args=training_args,
229
  train_dataset=train_dataset,
230
  data_collator=data_collator,
231
- tokenizer=tokenizer,
232
  )
233
 
234
  training_status["message"] = "🏃 Training in progress..."
 
235
  trainer.train()
 
 
236
  trainer.push_to_hub()
237
 
238
  training_status["session_count"] += 1
@@ -243,18 +274,21 @@ def run_training_session():
243
  training_status["message"] = "⏹️ Training stopped by user"
244
  return False
245
  except Exception as e:
246
- training_status["message"] = f"❌ Error: {str(e)}"
 
247
  print(f"[ERROR] Training failed: {e}")
248
  import traceback
249
  traceback.print_exc()
250
  return False
251
  finally:
252
- del model, trainer
 
 
 
253
  cleanup_memory()
254
 
255
  # ============ MAIN TRAINING LOOP ============
256
  def start_training():
257
- """Main training function with continuous loop"""
258
  global training_status, stop_requested
259
 
260
  if training_status["is_training"]:
@@ -289,7 +323,6 @@ def start_training():
289
 
290
  # ============ GRADIO INTERFACE ============
291
  def get_status():
292
- """Get current training status"""
293
  elapsed = ""
294
  if training_status["start_time"]:
295
  delta = datetime.now() - training_status["start_time"]
@@ -306,6 +339,7 @@ def get_status():
306
  continuous_str = "✅ Enabled" if CONTINUOUS_TRAINING else "❌ Disabled"
307
  elapsed_str = elapsed if elapsed else "N/A"
308
  effective_batch = BATCH_SIZE * GRADIENT_ACCUMULATION
 
309
 
310
  return f"""
311
  ## 🤖 AI Python Model Trainer
@@ -316,6 +350,7 @@ def get_status():
316
  | **State** | {state_str} |
317
  | **Message** | {training_status["message"]} |
318
  | **Sessions Completed** | {training_status["session_count"]} |
 
319
 
320
  ### Progress
321
  | Metric | Value |
@@ -335,7 +370,6 @@ def get_status():
335
  """
336
 
337
  def start_training_async():
338
- """Start training in background"""
339
  if training_status["is_training"]:
340
  return "⚠️ Training already in progress!"
341
  thread = threading.Thread(target=start_training, daemon=True)
@@ -343,7 +377,6 @@ def start_training_async():
343
  return "🚀 Training started in background!"
344
 
345
  def stop_training():
346
- """Stop training"""
347
  global stop_requested
348
  if not training_status["is_training"]:
349
  return "⚠️ No training in progress"
@@ -353,7 +386,6 @@ def stop_training():
353
 
354
  # ============ AUTO-START ============
355
  def auto_start():
356
- """Auto-start continuous training on Space launch"""
357
  time.sleep(10)
358
  while True:
359
  if not training_status["is_training"] and not stop_requested:
 
1
  """
2
+ AI Python Code Model Trainer - FIXED VERSION
3
+ Dataset: jtatman/python-code-dataset-500k
 
 
 
4
  """
5
 
6
  import os
 
9
  import threading
10
  import time
11
  from datetime import datetime
12
+ from huggingface_hub import HfApi, login, create_repo
13
  from transformers import (
14
  AutoModelForCausalLM,
15
  AutoTokenizer,
 
19
  )
20
  from datasets import load_dataset, Dataset
21
 
 
22
  try:
23
  import torch
24
  TORCH_AVAILABLE = True
 
31
  DATASET_NAME = "jtatman/python-code-dataset-500k"
32
  BASE_MODEL = "gpt2"
33
 
 
34
  BATCH_SIZE = 1
35
  GRADIENT_ACCUMULATION = 8
36
  SAVE_STEPS = 500
 
40
  MAX_STEPS_PER_SESSION = 10000
41
  EXAMPLES_PER_SESSION = 50000
42
 
43
+ CONTINUOUS_TRAINING = True
44
+ WAIT_BETWEEN_SESSIONS = 60
 
45
 
46
  # ============ GLOBAL STATE ============
47
  training_status = {
 
52
  "start_time": None,
53
  "message": "Initializing...",
54
  "session_count": 0,
55
+ "last_error": "",
56
  }
57
 
58
  stop_requested = False
59
 
60
  # ============ MEMORY CLEANUP ============
61
  def cleanup_memory():
 
62
  gc.collect()
63
  if TORCH_AVAILABLE and torch.cuda.is_available():
64
  torch.cuda.empty_cache()
 
66
 
67
  # ============ AUTHENTICATION ============
68
  def authenticate():
 
69
  token = os.environ.get("HF_TOKEN")
70
  if token:
71
  login(token=token)
72
  training_status["message"] = "✅ Authenticated with Hugging Face"
73
+
74
+ try:
75
+ api = HfApi()
76
+ api.create_repo(repo_id=MODEL_REPO, exist_ok=True)
77
+ print(f"[INFO] Repo {MODEL_REPO} ready")
78
+ except Exception as e:
79
+ print(f"[WARN] Repo check: {e}")
80
+
81
  return True
82
  else:
83
  training_status["message"] = "❌ HF_TOKEN not found in secrets!"
84
+ training_status["last_error"] = "Add HF_TOKEN to Space secrets"
85
  return False
86
 
87
  # ============ MODEL LOADING ============
88
  def load_model_and_tokenizer():
 
89
  global training_status
90
 
91
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
 
93
 
94
  try:
95
  training_status["message"] = f"🔄 Attempting to resume from {MODEL_REPO}..."
96
+ model = AutoModelForCausalLM.from_pretrained(MODEL_REPO, trust_remote_code=True)
97
  training_status["message"] = f"✅ Resumed from {MODEL_REPO}"
98
  print(f"[INFO] Resumed training from {MODEL_REPO}")
99
  except Exception as e:
 
105
 
106
  # ============ DATASET PROCESSING ============
107
  def prepare_dataset(tokenizer):
 
108
  global training_status
109
+ training_status["message"] = "📥 Loading dataset..."
110
 
111
  try:
112
+ # Load dataset in streaming mode
113
  dataset = load_dataset(DATASET_NAME, split="train", streaming=True)
114
+
115
+ # Take only what we need
116
  dataset = dataset.take(EXAMPLES_PER_SESSION)
117
 
118
+ training_status["message"] = "🔄 Processing examples..."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
+ all_examples = []
121
+ count = 0
 
 
 
 
122
 
123
+ for example in dataset:
124
+ try:
125
+ # Get instruction and output from dataset
126
+ # This dataset has: instruction, output, system columns
127
+ instruction = example.get("instruction", "")
128
+ output = example.get("output", "")
129
+
130
+ # Skip if empty
131
+ if not instruction or not output:
132
+ continue
133
+
134
+ # Make sure they are strings
135
+ if not isinstance(instruction, str):
136
+ instruction = str(instruction)
137
+ if not isinstance(output, str):
138
+ output = str(output)
139
+
140
+ # Create training text
141
+ text = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
142
+
143
+ # Tokenize
144
+ tokenized = tokenizer(
145
+ text,
146
+ truncation=True,
147
+ max_length=MAX_LENGTH,
148
+ padding="max_length",
149
+ return_tensors=None,
150
+ )
151
+
152
+ # Create example with only needed fields
153
+ processed_example = {
154
+ "input_ids": tokenized["input_ids"],
155
+ "attention_mask": tokenized["attention_mask"],
156
+ "labels": tokenized["input_ids"].copy(),
157
+ }
158
+
159
+ all_examples.append(processed_example)
160
+ count += 1
161
+
162
+ # Progress update
163
+ if count % 5000 == 0:
164
+ training_status["message"] = f"📥 Processed {count:,}/{EXAMPLES_PER_SESSION:,} examples..."
165
+ print(f"[INFO] Processed {count:,} examples...")
166
+
167
+ if count >= EXAMPLES_PER_SESSION:
168
+ break
169
+
170
+ except Exception as e:
171
+ # Skip problematic examples
172
+ continue
173
 
174
+ if len(all_examples) == 0:
175
+ raise ValueError("No valid examples found in dataset!")
 
 
 
 
 
176
 
177
+ # Create HuggingFace Dataset
178
  train_dataset = Dataset.from_list(all_examples)
179
 
180
  training_status["message"] = f"✅ Dataset ready: {len(train_dataset):,} examples"
181
+ print(f"[INFO] Dataset ready: {len(train_dataset):,} examples")
182
  return train_dataset
183
 
184
  except Exception as e:
185
  training_status["message"] = f"❌ Dataset error: {str(e)}"
186
+ training_status["last_error"] = str(e)
187
  print(f"[ERROR] Dataset preparation failed: {e}")
188
+ import traceback
189
+ traceback.print_exc()
190
  raise e
191
 
192
  # ============ CUSTOM TRAINER ============
193
  class StatusTrainer(Trainer):
 
 
194
  def training_step(self, model, inputs):
195
  global stop_requested
196
  if stop_requested:
 
209
 
210
  # ============ SINGLE TRAINING SESSION ============
211
  def run_training_session():
 
212
  global training_status, stop_requested
213
 
214
  model = None
 
221
  model, tokenizer = load_model_and_tokenizer()
222
  train_dataset = prepare_dataset(tokenizer)
223
 
224
+ if len(train_dataset) == 0:
225
+ training_status["message"] = "❌ Empty dataset!"
226
+ return False
227
+
228
  data_collator = DataCollatorForLanguageModeling(
229
  tokenizer=tokenizer,
230
  mlm=False,
 
248
  max_steps=MAX_STEPS_PER_SESSION,
249
  fp16=False,
250
  dataloader_num_workers=0,
251
+ remove_unused_columns=True,
252
  )
253
 
254
  trainer = StatusTrainer(
 
256
  args=training_args,
257
  train_dataset=train_dataset,
258
  data_collator=data_collator,
259
+ processing_class=tokenizer,
260
  )
261
 
262
  training_status["message"] = "🏃 Training in progress..."
263
+ print("[INFO] Starting training...")
264
  trainer.train()
265
+
266
+ print("[INFO] Pushing to hub...")
267
  trainer.push_to_hub()
268
 
269
  training_status["session_count"] += 1
 
274
  training_status["message"] = "⏹️ Training stopped by user"
275
  return False
276
  except Exception as e:
277
+ training_status["message"] = f"❌ Error: {str(e)[:100]}"
278
+ training_status["last_error"] = str(e)
279
  print(f"[ERROR] Training failed: {e}")
280
  import traceback
281
  traceback.print_exc()
282
  return False
283
  finally:
284
+ if model is not None:
285
+ del model
286
+ if trainer is not None:
287
+ del trainer
288
  cleanup_memory()
289
 
290
  # ============ MAIN TRAINING LOOP ============
291
  def start_training():
 
292
  global training_status, stop_requested
293
 
294
  if training_status["is_training"]:
 
323
 
324
  # ============ GRADIO INTERFACE ============
325
  def get_status():
 
326
  elapsed = ""
327
  if training_status["start_time"]:
328
  delta = datetime.now() - training_status["start_time"]
 
339
  continuous_str = "✅ Enabled" if CONTINUOUS_TRAINING else "❌ Disabled"
340
  elapsed_str = elapsed if elapsed else "N/A"
341
  effective_batch = BATCH_SIZE * GRADIENT_ACCUMULATION
342
+ error_str = training_status["last_error"][:100] if training_status["last_error"] else "None"
343
 
344
  return f"""
345
  ## 🤖 AI Python Model Trainer
 
350
  | **State** | {state_str} |
351
  | **Message** | {training_status["message"]} |
352
  | **Sessions Completed** | {training_status["session_count"]} |
353
+ | **Last Error** | {error_str} |
354
 
355
  ### Progress
356
  | Metric | Value |
 
370
  """
371
 
372
  def start_training_async():
 
373
  if training_status["is_training"]:
374
  return "⚠️ Training already in progress!"
375
  thread = threading.Thread(target=start_training, daemon=True)
 
377
  return "🚀 Training started in background!"
378
 
379
  def stop_training():
 
380
  global stop_requested
381
  if not training_status["is_training"]:
382
  return "⚠️ No training in progress"
 
386
 
387
  # ============ AUTO-START ============
388
  def auto_start():
 
389
  time.sleep(10)
390
  while True:
391
  if not training_status["is_training"] and not stop_requested: