File size: 4,148 Bytes
899f482
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import torch
import numpy as np
from datasets import Dataset
# IntervalStrategy๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ์ž„ํฌํŠธํ•˜์—ฌ ๋ฒ„์ „ ์ถฉ๋Œ์„ ๋ฐฉ์ง€ํ•ฉ๋‹ˆ๋‹ค.
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, IntervalStrategy
from sklearn.metrics import accuracy_score, f1_score

# 1. Pick the compute device: CUDA GPU when available, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"사용 장치: {device}")

# 2. Load the tokenizer and a lightweight pretrained backbone.
MODEL_NAME = "microsoft/xtremedistil-l12-h384-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# AutoModelForSequenceClassification attaches a fresh classification head;
# num_labels=2 -> binary sentiment (1 = positive, 0 = negative).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# 3. ๊ฐ€์ƒ ๋ฐ์ดํ„ฐ์…‹ ์ค€๋น„ ๋ฐ ์ „์ฒ˜๋ฆฌ
# ์‹ค์ œ ์‚ฌ์šฉ ์‹œ์—๋Š” ์ด ๋ถ€๋ถ„์„ ๋‹น์‹ ์˜ ํ•œ๊ตญ์–ด ๋ฆฌ๋ทฐ ๋ฐ์ดํ„ฐ๋กœ ๋Œ€์ฒดํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
data = {
    'text': [
        "๋งค๋„ˆ๊ฐ€ ์ •๋ง ์ข‹์œผ์„ธ์š”! ๊ธฐ๋ถ„ ์ข‹์€ ๊ฑฐ๋ž˜์˜€์Šต๋‹ˆ๋‹ค.",
        "๋ฌผ๊ฑด ์ƒํƒœ๊ฐ€ ๋ณ„๋กœ๊ณ  ๋‹ต๋ณ€๋„ ๋„ˆ๋ฌด ๋А๋ ธ์–ด์š”.",
        "๋น ๋ฅธ ์‘๋‹ต๊ณผ ๊น”๋”ํ•œ ๊ฑฐ๋ž˜ ๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค.",
        "๊ฐ€๊ฒฉ์ด ๋„ˆ๋ฌด ๋น„์‹ธ๋„ค์š”. ๋น„์ถ”์ž…๋‹ˆ๋‹ค.",
        "์˜ค๋Š˜๋„ ๋งŒ์กฑ์Šค๋Ÿฌ์šด ์ค‘๊ณ  ๊ฑฐ๋ž˜์˜€์Šต๋‹ˆ๋‹ค.",
        "์‹œ๊ฐ„ ์•ฝ์† ์•ˆ ์ง€ํ‚ค๊ณ  ์—ฐ๋ฝ๋„ ์ž˜ ์•ˆ ๋˜๋„ค์š”.",
        "์นœ์ ˆํ•จ ๋•๋ถ„์— ๊ฑฐ๋ž˜ ๊ณผ์ •์ด ์ˆœ์กฐ๋กœ์› ์Šต๋‹ˆ๋‹ค.",
        "ํŒ๋งค๊ธ€๊ณผ ์‹ค์ œ ์ œํ’ˆ์ด ๋‹ฌ๋ผ์„œ ์‹ค๋งํ–ˆ์Šต๋‹ˆ๋‹ค.",
    ],
    'label': [1, 0, 1, 0, 1, 0, 1, 0] # 1: ๊ธ์ •, 0: ๋ถ€์ •
}
raw_dataset = Dataset.from_dict(data)

# ๋ฐ์ดํ„ฐ์…‹์„ ํ•™์Šต(train)๊ณผ ํ‰๊ฐ€(test) ์„ธํŠธ๋กœ ๋ถ„ํ•  (8๊ฐœ ์ค‘ 4๊ฐœ์”ฉ ๋ถ„ํ• )
train_test_split = raw_dataset.train_test_split(test_size=0.5, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

def tokenize_function(examples):
    """Tokenize a batch of examples, truncating/padding every text to 128 tokens."""
    batch_texts = examples['text']
    # Fixed-length padding keeps the lightweight model's inputs uniform.
    return tokenizer(
        batch_texts,
        truncation=True,
        padding='max_length',
        max_length=128,
    )

# ๋ฐ์ดํ„ฐ์…‹์— ํ† ํฌ๋‚˜์ด์ € ์ ์šฉ ๋ฐ PyTorch ํ…์„œ ํ˜•์‹์œผ๋กœ ์ง€์ •
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True).with_format("torch")
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True).with_format("torch")

# 4. ํ‰๊ฐ€ ์ง€ํ‘œ ํ•จ์ˆ˜ ์ •์˜
def compute_metrics(p):
    predictions = np.argmax(p.predictions, axis=1)
    acc = accuracy_score(p.label_ids, predictions)
    f1 = f1_score(p.label_ids, predictions, average='binary') # ๊ธ์ •(1)์— ๋Œ€ํ•œ F1-Score
    return {"accuracy": acc, "f1": f1}

# 5. ํ•™์Šต ์„ค์ • (TrainingArguments)
OUTPUT_DIR = "./xtreme-distil-review-classifier" # ๋ชจ๋ธ ์ €์žฅ ๊ฒฝ๋กœ
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,                     # ๊ฒฝ๋Ÿ‰ ๋ชจ๋ธ์ด๋ฏ€๋กœ ์—ํฌํฌ ์ˆ˜๋ฅผ ์•ฝ๊ฐ„ ๋Š˜๋ ธ์Šต๋‹ˆ๋‹ค.
    per_device_train_batch_size=8,          # ๋ฐฐ์น˜ ํฌ๊ธฐ
    per_device_eval_batch_size=8,           
    warmup_steps=500,                       
    weight_decay=0.01,                      
    logging_dir='./logs',                   
    logging_steps=10,
    
    # ํ‰๊ฐ€ ๋ฐ ์ €์žฅ ์ „๋žต์„ 'EPOCH'์œผ๋กœ ํ†ต์ผํ•˜์—ฌ load_best_model_at_end๋ฅผ ํ™œ์„ฑํ™”ํ•ฉ๋‹ˆ๋‹ค.
    eval_strategy=IntervalStrategy.EPOCH, 
    save_strategy=IntervalStrategy.EPOCH,       
    
    load_best_model_at_end=True,            
    fp16=torch.cuda.is_available(),         
)

# 6. Assemble the Trainer and run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

print("\n--- 파인 튜닝 시작 (XTREME-Distil 모델) ---")
trainer.train()

# 7. Persist the fine-tuned model together with its tokenizer so the output
# directory can be reloaded directly via from_pretrained().
print(f"\n--- 파인 튜닝 완료, 모델을 {OUTPUT_DIR}에 저장 중 ---")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("모델 저장 완료. 이제 저장된 모델을 로드하여 바로 사용할 수 있습니다.")