# distilgpt2-werther-finetuned / finetune_werther.py
# Fine-tunes DistilGPT2 on the cleaned Werther text.
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset # Import Dataset directly
# --- 1. Define File Paths and Model Parameters ---
current_dir = os.getcwd()
cleaned_text_file = os.path.join(current_dir, "werther_cleaned_final.txt")
output_dir = os.path.join(current_dir, "fine_tuned_werther_model")
os.makedirs(output_dir, exist_ok=True)
model_max_length = 512
# --- 2. Load Tokenizer and Prepare Dataset (Manual Approach) ---
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Note: model.resize_token_embeddings(len(tokenizer)) is called AFTER the model is loaded (see step 3).
print(f"Reading entire text from: {cleaned_text_file}")
try:
    with open(cleaned_text_file, 'r', encoding='utf-8') as f:
        full_text = f.read()
except FileNotFoundError:
    print(f"Error: The file '{cleaned_text_file}' was not found.")
    raise SystemExit(1)
print("Tokenizing entire text...")
# Tokenize the entire text as one long sequence (no `truncation` or `return_overflowing_tokens`);
# chunking into fixed-size blocks is done manually below. The tokenizer may warn that the
# sequence exceeds the model's maximum length; that warning is expected and harmless here.
tokenized_output = tokenizer(full_text)
all_input_ids = tokenized_output["input_ids"]
print(f"Total tokens in cleaned text: {len(all_input_ids)}")
# Manually create fixed-size chunks
input_blocks = []
labels_blocks = []
for i in range(0, len(all_input_ids), model_max_length):
    chunk = all_input_ids[i : i + model_max_length]
    # Keep only full-length blocks and drop the short remainder at the end.
    # (Padding the last chunk is also possible, but full blocks are simpler for language modeling.)
    if len(chunk) == model_max_length:
        input_blocks.append(chunk)
        labels_blocks.append(chunk.copy())  # Labels equal input_ids; the model shifts them internally.
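# Alternative (untested sketch): fast tokenizers can produce these chunks directly via
# return_overflowing_tokens. The final chunk may be shorter than model_max_length and
# would still need to be dropped or padded, as above.
#
# enc = tokenizer(full_text, max_length=model_max_length, truncation=True,
#                 return_overflowing_tokens=True)
# input_blocks = [ids for ids in enc["input_ids"] if len(ids) == model_max_length]
# labels_blocks = [list(ids) for ids in input_blocks]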
# Create a Hugging Face Dataset from our manually prepared blocks
print(f"Number of processed blocks for training: {len(input_blocks)}")
# This ensures we have 'input_ids' and 'labels' columns
lm_dataset = Dataset.from_dict({
"input_ids": input_blocks,
"labels": labels_blocks
})
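# Optional (sketch): hold out a small validation split so the Trainer can evaluate during
# training. If enabled, pass eval_dataset=eval_dataset to the Trainer below and set
# evaluation_strategy="steps" (named eval_strategy in newer transformers versions)
# in the TrainingArguments.
#
# splits = lm_dataset.train_test_split(test_size=0.05, seed=42)
# lm_dataset = splits["train"]
# eval_dataset = splits["test"]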
# --- 3. Load Model and Data Collator ---
print("Loading DistilGPT2 model...")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
# If you added a padding token earlier, resize the model's token embeddings here
# This needs to be done *after* loading the pre-trained model.
model.resize_token_embeddings(len(tokenizer))
# Data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # False = causal language modeling (GPT-2 style), not masked LM
)
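# Quick sanity check (optional sketch): collate a couple of blocks and inspect the batch
# the Trainer will see. Safe to delete.
#
# example_batch = data_collator([lm_dataset[i] for i in range(2)])
# print({k: tuple(v.shape) for k, v in example_batch.items()})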
# --- 4. Define Training Arguments ---
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_steps=1000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=50,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluation is left disabled: no eval_dataset is passed to the Trainer below, and
    # enabling evaluation_strategy="steps" without one makes the Trainer raise an error.
)
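# If GPU memory is tight (see the OOM handler below), a common alternative (sketch) is a
# smaller per-device batch with gradient accumulation, keeping the effective batch size:
#     per_device_train_batch_size=2, gradient_accumulation_steps=4,
# and optionally fp16=True on a CUDA GPU that supports it.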
# --- 5. Initialize and Start Trainer ---
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset,  # the manually chunked dataset created above
    data_collator=data_collator,
)
print("\nStarting fine-tuning...")
try:
    trainer.train()
    print("Fine-tuning complete!")
    # --- 6. Save the Final Model ---
    print(f"Saving fine-tuned model and tokenizer to {output_dir}...")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Model and tokenizer saved successfully.")
except RuntimeError as e:
    if "out of memory" in str(e):
        print("\nERROR: Out of GPU memory! Try reducing `per_device_train_batch_size` (e.g., to 4 or 2) in the TrainingArguments.")
    else:
        raise
except Exception as e:
    print(f"\nAn error occurred during training: {e}")
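# Quick usage sketch (assumes training finished and the model was saved to output_dir):
# load the fine-tuned model and generate a short sample.
#
# from transformers import pipeline
# generator = pipeline("text-generation", model=output_dir, tokenizer=output_dir)
# sample = generator("Dear Wilhelm,", max_new_tokens=80, do_sample=True, top_p=0.95)
# print(sample[0]["generated_text"])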