WildOjisan commited on
Commit
3aa6ac0
Β·
1 Parent(s): e8a2c53
Files changed (1) hide show
  1. xtreme_distil_finetune_v2.py +121 -0
xtreme_distil_finetune_v2.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import pandas as pd
4
+ from datasets import Dataset
5
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, IntervalStrategy
6
+ from sklearn.metrics import accuracy_score, f1_score
7
+ from io import StringIO # λ¬Έμžμ—΄ 데이터λ₯Ό 파일처럼 μ²˜λ¦¬ν•˜κΈ° μœ„ν•΄ μž„ν¬νŠΈ
8
+
# 1. Pick the compute device: CUDA when a GPU is visible, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"μ‚¬μš© μž₯치: {device}")
+
# 2. Load the pre-trained checkpoint (XTREME-Distil) and its tokenizer.
MODEL_NAME = "microsoft/xtremedistil-l12-h384-uncased"
# num_labels=2: binary sentiment head (negative / positive).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"λͺ¨λΈ λ‘œλ“œ μ™„λ£Œ: {MODEL_NAME}")
18
+
# --- 3. Load and preprocess the shopping.txt review data ---

# 3-1. Inline sample of shopping.txt: one "<score>\t<review text>" per line.
# In a real environment this is replaced by reading the file directly:
#   df = pd.read_csv('shopping.txt', sep='\t', header=None, names=['score', 'text'])
shopping_data_content = """
5\tνŒλ§€μžλ‹˜ λ§€λ„ˆκ°€ 정말 μ’‹μ•„μ„œ κΈ°λΆ„ 쒋은 κ±°λž˜μ˜€μŠ΅λ‹ˆλ‹€.
2\t물건 μƒνƒœκ°€ 생각보닀 λ„ˆλ¬΄ μ•ˆ μ’‹μ•„μ„œ μ†μ•˜λ‹€λŠ” λŠλ‚Œμ΄ λ“­λ‹ˆλ‹€.
5\t정말 λΉ λ₯΄κ²Œ μ‘λ‹΅ν•΄μ£Όμ‹œκ³  μ‹œκ°„ 약속도 잘 μ§€ν‚€μ…¨μŠ΅λ‹ˆλ‹€.
1\tλŒ€λ‹΅μ΄ μ—†κ³  μž μˆ˜νƒ€λŠ” νŒλ§€μžλŠ” 정말 μ΅œμ•…μž…λ‹ˆλ‹€.
4\t배솑이 쑰금 λŠλ Έμ§€λ§Œ, μƒν’ˆ μžμ²΄λŠ” λ§Œμ‘±μŠ€λŸ¬μ›Œμš”.
1\tλ³„λ‘œ. μ ˆλŒ€ λ‹€μ‹œ κ±°λž˜ν•˜μ§€ μ•Šμ„ κ²ƒμž…λ‹ˆλ‹€.
5\t별 λ‹€μ„― κ°œλ„ λΆ€μ‘±ν•΄μš”. μ™„λ²½ν•œ κ±°λž˜μ˜€μŠ΅λ‹ˆλ‹€.
3\tκ·Έλƒ₯μ €λƒ₯ μ“Έλ§Œν•΄μš”. λ‹€μŒμ—λŠ” λ‹€λ₯Έ νŒλ§€μžμ—κ²Œ κ΅¬λ§€ν• λž˜μš”.
2\t판맀자 λ§€λ„ˆκ°€ μ—‰λ§μ΄λ„€μš”.
5\t쿨거래 ν•΄μ£Όμ…”μ„œ κ°μ‚¬ν•©λ‹ˆλ‹€!
"""

# 3-2. Parse the tab-separated content into a DataFrame.
# BUGFIX: the original `except` printed the error and fell through with
# `pass`, leaving `df` undefined so the very next statement crashed with a
# confusing NameError.  Log the error, then re-raise so the failure is
# reported at its true source.
try:
    df = pd.read_csv(StringIO(shopping_data_content), sep='\t', header=None, names=['score', 'text'])
except Exception as e:
    print(f"데이터 λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {e}")
    raise
50
+
# 3-3. Derive binary labels: scores 1-2 -> 0 (negative), 3-5 -> 1 (positive).
df['label'] = df['score'].apply(lambda s: 1 if s > 2 else 0)

neg_count = (df['label'] == 0).sum()
pos_count = (df['label'] == 1).sum()
print(f"총 데이터 수: {len(df)}개")
print(f"λΆ€μ • 리뷰 (0): {neg_count}개")
print(f"긍정 리뷰 (1): {pos_count}개")
58
+
# Wrap the labelled frame in a Hugging Face Dataset and carve out a held-out
# evaluation split (80% train / 20% test, fixed seed for reproducibility).
raw_dataset = Dataset.from_pandas(df[['text', 'label']])

splits = raw_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = splits['train'], splits['test']
66
+
def tokenize_function(examples):
    """Tokenize a batch of review texts into fixed-length (128) encodings."""
    # Pad/truncate to a constant length so the lightweight model gets
    # uniform tensors from every batch.
    encoded = tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    return encoded
70
+
# Tokenize both splits and expose them as PyTorch tensors for the Trainer.
tokenized_train_dataset = (
    train_dataset.map(tokenize_function, batched=True).with_format("torch")
)
tokenized_eval_dataset = (
    eval_dataset.map(tokenize_function, batched=True).with_format("torch")
)

print("데이터셋 μ€€λΉ„ μ™„λ£Œ.")
76
+ # -------------------------------------------------------------------
77
+
# 4. Metric callback for the Trainer: accuracy and binary F1.
def compute_metrics(p):
    """Turn an EvalPrediction-like object into accuracy/F1 scores."""
    preds = np.argmax(p.predictions, axis=1)
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "f1": f1_score(p.label_ids, preds, average='binary'),
    }
84
+
# 5. Training configuration.
OUTPUT_DIR = "./xtreme-distil-review-classifier"
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # BUGFIX: warmup_steps=500 vastly exceeded the handful of optimizer
    # steps this tiny dataset produces (~1 step/epoch at batch size 8), so
    # the learning rate never left warmup and training was effectively a
    # no-op.  A relative warmup ratio scales with the real dataset size.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,

    # Evaluate and checkpoint once per epoch so load_best_model_at_end can
    # restore the best checkpoint (selected by eval loss, the default).
    eval_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,

    load_best_model_at_end=True,
    # Mixed precision only when a CUDA device is actually available.
    fp16=torch.cuda.is_available(),
)
103
+
# 6. Assemble the Trainer and launch fine-tuning.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)
trainer = Trainer(**trainer_kwargs)

print("\n--- 파인 νŠœλ‹ μ‹œμž‘ (XTREME-Distil λͺ¨λΈ) ---")
trainer.train()
115
+
# 7. Persist the fine-tuned model together with its tokenizer so the
# output directory is a self-contained, reloadable checkpoint.
print(f"\n--- 파인 νŠœλ‹ μ™„λ£Œ, λͺ¨λΈμ„ {OUTPUT_DIR}에 μ €μž₯ 쀑 ---")
for save in (trainer.save_model, tokenizer.save_pretrained):
    save(OUTPUT_DIR)

print("λͺ¨λΈ μ €μž₯ μ™„λ£Œ. 이제 μ €μž₯된 λͺ¨λΈμ„ λ‘œλ“œν•˜μ—¬ λ°”λ‘œ μ‚¬μš©ν•  수 μžˆμŠ΅λ‹ˆλ‹€.")