import torch
import numpy as np
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

# 1. Device setup (the Trainer handles placement automatically during training; this is just for confirmation)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Load model and tokenizer
MODEL_NAME = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Load AutoModelForSequenceClassification for classification (adds a classification head)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# num_labels=2: configured to distinguish positive (1) from negative (0)
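# Optional, a hedged sketch not in the original pipeline: passing an explicit
# label mapping here makes the saved model report readable label names
# ("negative"/"positive") instead of the default LABEL_0/LABEL_1 at inference.
# model = AutoModelForSequenceClassification.from_pretrained(
#     MODEL_NAME,
#     num_labels=2,
#     id2label={0: "negative", 1: "positive"},
#     label2id={"negative": 0, "positive": 1},
# )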

# 3. Prepare and preprocess a toy dataset
# For real training, load your ๋‹น๊ทผ๋งˆ์ผ“ (Danggeun Market) review data here
# instead (a sketch follows the dataset creation below).
data = {
    'text': [
        "๋งค๋„ˆ๊ฐ€ ์ •๋ง ์ข‹์œผ์„ธ์š”! ๊ธฐ๋ถ„ ์ข‹์€ ๊ฑฐ๋ž˜์˜€์Šต๋‹ˆ๋‹ค.",  # positive
        "๋ฌผ๊ฑด ์ƒํƒœ๊ฐ€ ๋ณ„๋กœ๊ณ  ๋‹ต๋ณ€๋„ ๋„ˆ๋ฌด ๋А๋ ธ์–ด์š”.",      # negative
        "๋น ๋ฅธ ์‘๋‹ต๊ณผ ๊น”๋”ํ•œ ๊ฑฐ๋ž˜ ๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค.",          # positive
        "๊ฐ€๊ฒฉ์ด ๋„ˆ๋ฌด ๋น„์‹ธ๋„ค์š”. ๋น„์ถ”์ž…๋‹ˆ๋‹ค.",          # negative
        "์˜ค๋Š˜๋„ ๋งŒ์กฑ์Šค๋Ÿฌ์šด ์ค‘๊ณ  ๊ฑฐ๋ž˜์˜€์Šต๋‹ˆ๋‹ค.",         # positive
        "์‹œ๊ฐ„ ์•ฝ์† ์•ˆ ์ง€ํ‚ค๊ณ  ์—ฐ๋ฝ๋„ ์ž˜ ์•ˆ ๋˜๋„ค์š”.",     # negative
    ],
    'label': [1, 0, 1, 0, 1, 0]  # 1: positive, 0: negative
}
raw_dataset = Dataset.from_dict(data)
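# A minimal sketch of loading real review data in place of the toy dictionary
# above, using the load_dataset import at the top of the script. The file name
# "reviews.csv" and its column names ("text", "label") are assumptions for
# illustration; adjust them to match your actual export.
# raw_dataset = load_dataset(
#     "csv", data_files="reviews.csv", split="train"
# )  # expects columns: text (str), label (0 or 1)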

# ๋ฐ์ดํ„ฐ์…‹์„ ํ•™์Šต(train)๊ณผ ํ‰๊ฐ€(test) ์„ธํŠธ๋กœ ๋ถ„ํ•  (์˜ˆ์‹œ์ด๋ฏ€๋กœ 50:50)
train_test_split = raw_dataset.train_test_split(test_size=0.5, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

def tokenize_function(examples):
    # Tokenize the input text, padding/truncating to a fixed length of 128 tokens.
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

# ๋ฐ์ดํ„ฐ์…‹์— ํ† ํฌ๋‚˜์ด์ € ์ ์šฉ
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# 4. Define evaluation metrics
def compute_metrics(p):
    # Take the argmax over the predicted logits to get predicted labels.
    predictions = np.argmax(p.predictions, axis=1)
    # Compute accuracy and F1 score.
    acc = accuracy_score(p.label_ids, predictions)
    f1 = f1_score(p.label_ids, predictions, average='binary')  # F1 for the positive class (1)
    return {"accuracy": acc, "f1": f1}

# 5. Training configuration (TrainingArguments)
OUTPUT_DIR = "./xlm-roberta-review-classifier"  # where to save the model
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,                     # number of epochs (3-5 is typical for fine-tuning)
    per_device_train_batch_size=8,          # train batch size per GPU (adjust to fit VRAM)
    per_device_eval_batch_size=8,           # eval batch size per GPU
    warmup_steps=500,                       # steps until the learning rate peaks (far more than this toy run takes; tune for real data)
    weight_decay=0.01,                      # weight decay (regularization against overfitting)
    logging_dir='./logs',                   # log directory
    logging_steps=10,
    eval_strategy="epoch",                  # evaluate every epoch
    save_strategy="epoch",                  # save a checkpoint every epoch (must match eval_strategy for load_best_model_at_end)
    load_best_model_at_end=True,            # reload the best-performing checkpoint when training finishes
    fp16=torch.cuda.is_available(),         # use fp16 for speed when a GPU is available
)

# 6. Create the Trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)

print("\n--- ํŒŒ์ธ ํŠœ๋‹ ์‹œ์ž‘ ---")
trainer.train()

# 7. Save the final model
# Save the fine-tuned model and tokenizer to the output directory.
print(f"\n--- Fine-tuning complete, saving model to {OUTPUT_DIR} ---")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("๋ชจ๋ธ ์ €์žฅ ์™„๋ฃŒ. ์ด์ œ ์ €์žฅ๋œ ๋ชจ๋ธ์„ ๋กœ๋“œํ•˜์—ฌ ๋ฐ”๋กœ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")