# distilgpt2-werther-finetuned / finetune_werther.py
# Fine-tunes DistilGPT2 on the cleaned Werther text.
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset # Import Dataset directly
# --- 1. Define File Paths and Model Parameters ---
current_dir = os.getcwd()
cleaned_text_file = os.path.join(current_dir, "werther_cleaned_final.txt")
output_dir = os.path.join(current_dir, "fine_tuned_werther_model")
os.makedirs(output_dir, exist_ok=True)
model_max_length = 512
# --- 2. Load Tokenizer and Prepare Dataset (Manual Approach) ---
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Note: model.resize_token_embeddings(len(tokenizer)) is called AFTER the model is loaded (see step 3).
print(f"Reading entire text from: {cleaned_text_file}")
try:
    with open(cleaned_text_file, 'r', encoding='utf-8') as f:
        full_text = f.read()
except FileNotFoundError:
    print(f"Error: The file '{cleaned_text_file}' was not found.")
    raise SystemExit(1)
print("Tokenizing entire text...")
# Tokenize the entire text as one long sequence (no `truncation` or `return_overflowing_tokens`);
# chunking into fixed-size blocks is done manually below. The tokenizer may warn that the
# sequence exceeds the model's maximum length; that warning is expected and harmless here.
tokenized_output = tokenizer(full_text)
all_input_ids = tokenized_output["input_ids"]
print(f"Total tokens in cleaned text: {len(all_input_ids)}")
# Manually create fixed-size chunks
input_blocks = []
labels_blocks = []
for i in range(0, len(all_input_ids), model_max_length):
    chunk = all_input_ids[i : i + model_max_length]
    # Keep only full-length blocks and drop the short remainder at the end.
    # (Padding the last chunk is also possible, but full blocks are simpler for language modeling.)
    if len(chunk) == model_max_length:
        input_blocks.append(chunk)
        labels_blocks.append(chunk.copy())  # Labels equal input_ids; the model shifts them internally.
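# Alternative (untested sketch): fast tokenizers can produce these chunks directly via
# return_overflowing_tokens. The final chunk may be shorter than model_max_length and
# would still need to be dropped or padded, as above.
#
# enc = tokenizer(full_text, max_length=model_max_length, truncation=True,
#                 return_overflowing_tokens=True)
# input_blocks = [ids for ids in enc["input_ids"] if len(ids) == model_max_length]
# labels_blocks = [list(ids) for ids in input_blocks]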
# Create a Hugging Face Dataset from our manually prepared blocks
print(f"Number of processed blocks for training: {len(input_blocks)}")
# This ensures we have 'input_ids' and 'labels' columns
lm_dataset = Dataset.from_dict({
"input_ids": input_blocks,
"labels": labels_blocks
})
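# Optional (sketch): hold out a small validation split so the Trainer can evaluate during
# training. If enabled, pass eval_dataset=eval_dataset to the Trainer below and set
# evaluation_strategy="steps" (named eval_strategy in newer transformers versions)
# in the TrainingArguments.
#
# splits = lm_dataset.train_test_split(test_size=0.05, seed=42)
# lm_dataset = splits["train"]
# eval_dataset = splits["test"]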
# --- 3. Load Model and Data Collator ---
print("Loading DistilGPT2 model...")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
# If you added a padding token earlier, resize the model's token embeddings here
# This needs to be done *after* loading the pre-trained model.
model.resize_token_embeddings(len(tokenizer))
# Data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # False = causal language modeling (GPT-2 style), not masked LM
)
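# Quick sanity check (optional sketch): collate a couple of blocks and inspect the batch
# the Trainer will see. Safe to delete.
#
# example_batch = data_collator([lm_dataset[i] for i in range(2)])
# print({k: tuple(v.shape) for k, v in example_batch.items()})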
# --- 4. Define Training Arguments ---
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_steps=1000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=50,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluation is left disabled: no eval_dataset is passed to the Trainer below, and
    # enabling evaluation_strategy="steps" without one makes the Trainer raise an error.
)
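# If GPU memory is tight (see the OOM handler below), a common alternative (sketch) is a
# smaller per-device batch with gradient accumulation, keeping the effective batch size:
#     per_device_train_batch_size=2, gradient_accumulation_steps=4,
# and optionally fp16=True on a CUDA GPU that supports it.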
# --- 5. Initialize and Start Trainer ---
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset,  # the manually chunked dataset created above
    data_collator=data_collator,
)
print("\nStarting fine-tuning...")
try:
    trainer.train()
    print("Fine-tuning complete!")
    # --- 6. Save the Final Model ---
    print(f"Saving fine-tuned model and tokenizer to {output_dir}...")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Model and tokenizer saved successfully.")
except RuntimeError as e:
    if "out of memory" in str(e):
        print("\nERROR: Out of GPU memory! Try reducing `per_device_train_batch_size` (e.g., to 4 or 2) in the TrainingArguments.")
    else:
        raise
except Exception as e:
    print(f"\nAn error occurred during training: {e}")
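# Quick usage sketch (assumes training finished and the model was saved to output_dir):
# load the fine-tuned model and generate a short sample.
#
# from transformers import pipeline
# generator = pipeline("text-generation", model=output_dir, tokenizer=output_dir)
# sample = generator("Dear Wilhelm,", max_new_tokens=80, do_sample=True, top_p=0.95)
# print(sample[0]["generated_text"])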