"""Fine-tune DistilGPT2 on the cleaned Werther text (werther_cleaned_final.txt)."""

import os

from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Paths and basic configuration.
current_dir = os.getcwd()
cleaned_text_file = os.path.join(current_dir, "werther_cleaned_final.txt")
output_dir = os.path.join(current_dir, "fine_tuned_werther_model")
os.makedirs(output_dir, exist_ok=True)

# Block size for the training examples. DistilGPT2 accepts contexts of up to
# 1024 tokens; 512 keeps memory usage modest.
model_max_length = 512
print("Loading tokenizer...") |
|
|
tokenizer = AutoTokenizer.from_pretrained("distilgpt2") |
|
|
if tokenizer.pad_token is None: |
|
|
tokenizer.add_special_tokens({'pad_token': '[PAD]'}) |

print(f"Reading entire text from: {cleaned_text_file}")
try:
    with open(cleaned_text_file, 'r', encoding='utf-8') as f:
        full_text = f.read()
except FileNotFoundError:
    print(f"Error: The file '{cleaned_text_file}' was not found.")
    raise SystemExit(1)

print("Tokenizing entire text...")
# Tokenize the whole text in one pass. The tokenizer will warn that the sequence
# is longer than the model's maximum length; that is expected here, because the
# token ids are split into fixed-size blocks below rather than fed to the model directly.
tokenized_output = tokenizer(full_text)
all_input_ids = tokenized_output["input_ids"]
print(f"Total tokens in cleaned text: {len(all_input_ids)}")

# Split the token ids into non-overlapping blocks of model_max_length tokens.
# The final partial block is dropped so every training example has the same length.
input_blocks = []
labels_blocks = []
for i in range(0, len(all_input_ids), model_max_length):
    chunk = all_input_ids[i : i + model_max_length]
    if len(chunk) == model_max_length:
        input_blocks.append(chunk)
        labels_blocks.append(chunk.copy())

print(f"Number of processed blocks for training: {len(input_blocks)}")

lm_dataset = Dataset.from_dict({
    "input_ids": input_blocks,
    "labels": labels_blocks,
})
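
# Optional (a sketch, not needed for this run): hold out a small validation split
# with `Dataset.train_test_split` if you later want periodic evaluation, and pass
# the "test" split to the Trainer as `eval_dataset`.
# split_dataset = lm_dataset.train_test_split(test_size=0.1, seed=42)
# lm_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]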

print("Loading DistilGPT2 model...")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Grow the embedding matrix to cover the [PAD] token added to the tokenizer above.
model.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal language modeling, not masked language modeling
)
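
# Note: with mlm=False the collator builds the "labels" for each batch by copying
# "input_ids" (padding positions are set to -100), so the explicit "labels" column
# created above is redundant but harmless.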

print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_steps=1000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=50,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluation is left disabled: no eval_dataset is passed to the Trainer below,
    # and setting evaluation_strategy="steps" without one makes the Trainer error out.
)

print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset,
    data_collator=data_collator,
)

print("\nStarting fine-tuning...")
try:
    trainer.train()
    print("Fine-tuning complete!")

    print(f"Saving fine-tuned model and tokenizer to {output_dir}...")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Model and tokenizer saved successfully.")

except RuntimeError as e:
    if "out of memory" in str(e):
        print("\nERROR: Out of GPU memory! Try reducing `per_device_train_batch_size` (e.g., to 4 or 2) in the TrainingArguments.")
    else:
        raise
except Exception as e:
    print(f"\nAn error occurred during training: {e}")