# python_roberta_hf/roberta_finetune.py
import torch
import numpy as np
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
# 1. GPU/CPU device setup (the Trainer handles device placement automatically; this is just a check)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"์‚ฌ์šฉ ์žฅ์น˜: {device}")
# 2. ๋ชจ๋ธ ๋ฐ ํ† ํฌ๋‚˜์ด์ € ๋กœ๋“œ
MODEL_NAME = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load AutoModelForSequenceClassification for classification (adds a classification head)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# num_labels=2: distinguish positive (1) from negative (0)
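# (Optional sketch, not in the original script: id2label/label2id are standard
# from_pretrained kwargs that give the saved model human-readable labels
# instead of the generic LABEL_0 / LABEL_1.)
# model = AutoModelForSequenceClassification.from_pretrained(
#     MODEL_NAME, num_labels=2,
#     id2label={0: "negative", 1: "positive"},
#     label2id={"negative": 0, "positive": 1},
# )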
# 3. Prepare and preprocess a toy dataset
# For real training, load your own Danggeun Market (Karrot) review data here.
data = {
    'text': [
        "๋งค๋„ˆ๊ฐ€ ์ •๋ง ์ข‹์œผ์„ธ์š”! ๊ธฐ๋ถ„ ์ข‹์€ ๊ฑฐ๋ž˜์˜€์Šต๋‹ˆ๋‹ค.",  # positive
        "๋ฌผ๊ฑด ์ƒํƒœ๊ฐ€ ๋ณ„๋กœ๊ณ  ๋‹ต๋ณ€๋„ ๋„ˆ๋ฌด ๋А๋ ธ์–ด์š”.",  # negative
        "๋น ๋ฅธ ์‘๋‹ต๊ณผ ๊น”๋”ํ•œ ๊ฑฐ๋ž˜ ๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค.",  # positive
        "๊ฐ€๊ฒฉ์ด ๋„ˆ๋ฌด ๋น„์‹ธ๋„ค์š”. ๋น„์ถ”์ž…๋‹ˆ๋‹ค.",  # negative
        "์˜ค๋Š˜๋„ ๋งŒ์กฑ์Šค๋Ÿฌ์šด ์ค‘๊ณ  ๊ฑฐ๋ž˜์˜€์Šต๋‹ˆ๋‹ค.",  # positive
        "์‹œ๊ฐ„ ์•ฝ์† ์•ˆ ์ง€ํ‚ค๊ณ  ์—ฐ๋ฝ๋„ ์ž˜ ์•ˆ ๋˜๋„ค์š”.",  # negative
    ],
    'label': [1, 0, 1, 0, 1, 0]  # 1: positive, 0: negative
}
raw_dataset = Dataset.from_dict(data)
# ๋ฐ์ดํ„ฐ์…‹์„ ํ•™์Šต(train)๊ณผ ํ‰๊ฐ€(test) ์„ธํŠธ๋กœ ๋ถ„ํ•  (์˜ˆ์‹œ์ด๋ฏ€๋กœ 50:50)
train_test_split = raw_dataset.train_test_split(test_size=0.5, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']
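# Sanity check (added sketch): with 6 examples and test_size=0.5 this is a 3/3 split.
print(f"train examples: {len(train_dataset)}, eval examples: {len(eval_dataset)}")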
def tokenize_function(examples):
    # Tokenize the input text with fixed-length padding.
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
# ๋ฐ์ดํ„ฐ์…‹์— ํ† ํฌ๋‚˜์ด์ € ์ ์šฉ
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)
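# Sanity check (added sketch): round-trip one tokenized example back to text
# to confirm the tokenizer handles the Korean reviews as expected.
sample = tokenized_train_dataset[0]
print(tokenizer.decode(sample['input_ids'], skip_special_tokens=True))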
# 4. Define the evaluation metric function
def compute_metrics(p):
    # Take the argmax over the predicted logits to get the predicted labels.
    predictions = np.argmax(p.predictions, axis=1)
    # Compute accuracy and F1 score.
    acc = accuracy_score(p.label_ids, predictions)
    f1 = f1_score(p.label_ids, predictions, average='binary')  # F1 for the positive class (1)
    return {"accuracy": acc, "f1": f1}
# 5. Training configuration (TrainingArguments)
OUTPUT_DIR = "./xlm-roberta-review-classifier"  # path where the model will be saved
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,              # number of epochs (3-5 is a reasonable starting point)
    per_device_train_batch_size=8,   # training batch size per GPU (adjust to fit VRAM)
    per_device_eval_batch_size=8,    # evaluation batch size per GPU
    warmup_steps=500,                # steps until the learning rate reaches its peak
                                     # (far more than this toy run's total steps; tune for real data)
    weight_decay=0.01,               # weight decay (regularization against overfitting)
    logging_dir='./logs',            # log directory
    logging_steps=10,
    eval_strategy="epoch",           # evaluate every epoch
    save_strategy="epoch",           # save a checkpoint every epoch
    load_best_model_at_end=True,     # reload the best-performing checkpoint after training
    fp16=torch.cuda.is_available(),  # use fp16 on GPU for speed
)
# 6. Build the Trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)
print("\n--- ํŒŒ์ธ ํŠœ๋‹ ์‹œ์ž‘ ---")
trainer.train()
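# Final evaluation on the held-out split (added sketch): with
# load_best_model_at_end=True this scores the best checkpoint.
eval_results = trainer.evaluate()
print(f"Final eval metrics: {eval_results}")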
# 7. Save the final model
# Save the trained model and tokenizer to the given path.
print(f"\n--- Fine-tuning complete; saving the model to {OUTPUT_DIR} ---")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("๋ชจ๋ธ ์ €์žฅ ์™„๋ฃŒ. ์ด์ œ ์ €์žฅ๋œ ๋ชจ๋ธ์„ ๋กœ๋“œํ•˜์—ฌ ๋ฐ”๋กœ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")