# python_roberta_hf / xtreme_distil_finetine.py
import torch
import numpy as np
from datasets import Dataset
# Import IntervalStrategy explicitly to avoid version conflicts.
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, IntervalStrategy
from sklearn.metrics import accuracy_score, f1_score
# 1. Device setup (GPU/CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# 2. Load model and tokenizer (lightweight checkpoint)
MODEL_NAME = "microsoft/xtremedistil-l12-h384-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# AutoModelForSequenceClassification adds a freshly initialized classification head on top of the encoder.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# num_labels=2: binary classification (positive: 1, negative: 0)
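# --- Optional sanity check (added sketch, not in the original script) ---
# Printing the parameter count confirms this is the lightweight distilled
# checkpoint (tens of millions of parameters rather than hundreds).
num_params = sum(p.numel() for p in model.parameters())
print(f"{MODEL_NAME}: {num_params / 1e6:.1f}M parameters")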
# 3. Prepare and preprocess a toy dataset
# For real use, replace this block with your own Korean review data.
data = {
    'text': [
        "매너가 정말 좋으세요! 기분 좋은 거래였습니다.",      # Great manners! A pleasant transaction.
        "물건 상태가 별로고 답변도 너무 느렸어요.",          # Item condition was poor and replies were slow.
        "빠른 응답과 깔끔한 거래 감사합니다.",              # Thanks for the quick replies and clean transaction.
        "가격이 너무 비싸네요. 비추입니다.",                # Way too expensive. Not recommended.
        "오늘도 만족스러운 중고 거래였습니다.",             # Another satisfying secondhand deal today.
        "시간 약속 안 지키고 연락도 잘 안 되네요.",         # Didn't keep the appointment and was hard to reach.
        "친절함 덕분에 거래 과정이 순조로웠습니다.",        # The kindness made the whole deal go smoothly.
        "판매글과 실제 제품이 달라서 실망했습니다.",        # Disappointed; the item didn't match the listing.
    ],
    'label': [1, 0, 1, 0, 1, 0, 1, 0]  # 1: positive, 0: negative
}
raw_dataset = Dataset.from_dict(data)
# Split the dataset into train and eval sets (4 examples each out of 8)
split_datasets = raw_dataset.train_test_split(test_size=0.5, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']
def tokenize_function(examples):
    # Tokenize the text; max_length=128 keeps sequences short, suiting the lightweight model.
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
# Tokenize the datasets and set them to return PyTorch tensors
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True).with_format("torch")
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True).with_format("torch")
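# --- Optional inspection (added sketch) ---
# Decode the first training example back from its input_ids to verify that
# tokenization, truncation, and padding behaved as expected.
_sample = tokenized_train_dataset[0]
print("First train example:", tokenizer.decode(_sample["input_ids"], skip_special_tokens=True))
print("Its label:", int(_sample["label"]))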
# 4. Define the evaluation metrics
def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    acc = accuracy_score(eval_pred.label_ids, predictions)
    f1 = f1_score(eval_pred.label_ids, predictions, average='binary')  # F1 for the positive class (1)
    return {"accuracy": acc, "f1": f1}
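# --- Optional self-test of compute_metrics (added sketch) ---
# A fake EvalPrediction-shaped object exercises the metric plumbing without a
# training run; with these logits and labels, both metrics should be 1.0.
from types import SimpleNamespace
_fake = SimpleNamespace(
    predictions=np.array([[0.1, 0.9], [0.8, 0.2]]),  # argmax -> [1, 0]
    label_ids=np.array([1, 0]),
)
print("compute_metrics self-test:", compute_metrics(_fake))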
# 5. Training configuration (TrainingArguments)
OUTPUT_DIR = "./xtreme-distil-review-classifier"  # where the fine-tuned model is saved
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,              # a few extra epochs, since the model is lightweight
    per_device_train_batch_size=8,   # batch size
    per_device_eval_batch_size=8,
    warmup_steps=0,                  # this toy run is only ~5 optimizer steps, so warmup_steps=500 would keep the LR near zero; scale warmup up for real data
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1,                 # log every step; the demo run is only a handful of steps
    # Unify eval and save strategies on EPOCH so load_best_model_at_end works.
    eval_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    load_best_model_at_end=True,
    metric_for_best_model="f1",      # select the checkpoint with the best F1 rather than the lowest eval loss
    fp16=torch.cuda.is_available(),
)
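# --- Optional step-count check (added sketch) ---
# With 4 training examples and a per-device batch size of 8, each epoch is a
# single optimizer step, so the whole run is about 5 steps. This is why
# warmup_steps is left at 0 above.
import math
_steps_per_epoch = math.ceil(len(train_dataset) / training_args.per_device_train_batch_size)
print(f"Approx. total optimizer steps: {_steps_per_epoch * int(training_args.num_train_epochs)}")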
# 6. Create the Trainer and start fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)
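# --- Optional baseline (added sketch) ---
# Evaluating before fine-tuning gives a reference point: with a randomly
# initialized classification head, accuracy should sit near chance (~0.5).
print("Pre-training baseline:", trainer.evaluate())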
print("\n--- ํŒŒ์ธ ํŠœ๋‹ ์‹œ์ž‘ (XTREME-Distil ๋ชจ๋ธ) ---")
trainer.train()
# 7. Save the final model
print(f"\n--- Fine-tuning complete, saving model to {OUTPUT_DIR} ---")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Model saved. You can now load it and run inference right away, as sketched below.")