WildOjisan committed on
Commit
899f482
·
1 Parent(s): 5a004f0
.dockerignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ *.log
6
+ .git
7
+ .gitignore
8
+ .venv/
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ *.log
6
+ .venv/
7
+ xtreme-distil-review-classifier/
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dockerfile
2
+ FROM python:3.11-slim
3
+
4
+ RUN apt-get update && apt-get install -y --no-install-recommends \
5
+ libglib2.0-0 libgl1 && \
6
+ rm -rf /var/lib/apt/lists/*
7
+
8
+ # โœ… ์บ์‹œ/ํ† ํฐ ๊ฒฝ๋กœ๋ฅผ /data๋กœ ๊ฐ•์ œ
9
+ ENV HF_HOME=/data \
10
+ TRANSFORMERS_CACHE=/data/transformers \
11
+ HF_HUB_CACHE=/data/hub \
12
+ HF_HUB_DISABLE_TELEMETRY=1 \
13
+ TOKENIZERS_PARALLELISM=false \
14
+ PYTHONUNBUFFERED=1 \
15
+ PYTHONDONTWRITEBYTECODE=1
16
+
17
+ # โœ… ๋””๋ ‰ํ„ฐ๋ฆฌ ์ƒ์„ฑ + ํผ๋ฏธ์…˜(์“ฐ๊ธฐ ๊ฐ€๋Šฅ)
18
+ RUN mkdir -p /data/transformers /data/hub && chmod -R 777 /data
19
+
20
+ WORKDIR /app
21
+ COPY requirements.txt /app/requirements.txt
22
+ RUN pip install --no-cache-dir -r /app/requirements.txt
23
+ COPY . /app
24
+
25
+ EXPOSE 7860
26
+ CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860} --workers 1"]
README.md CHANGED
@@ -8,3 +8,15 @@ pinned: false
8
  ---
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  ---
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
11
+
12
+ py -3.10 -m uv venv venv
13
+
14
+ .\venv\Scripts\Activate.ps1
15
+
16
+ ์œˆ๋„์šฐ์—์„œ CUDA ๋ฒ„์ „ ํ™•์ธ ๋ฐฉ๋ฒ•
17
+ nvcc --version
18
+
19
+ GPU ๋“œ๋ผ์ด๋ฒ„๊ฐ€ ์ง€์›ํ•˜๋Š” ์ตœ๋Œ€ CUDA ๋ฒ„์ „
20
+ nvidia-smi
21
+
22
+ uv pip install torch torchvision --index-url https://download.pytorch.org/whl/cu130
kobert_test.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch
import numpy as np
from transformers import AutoModel
# KoBERT-specific tokenizer (not interchangeable with the generic HF tokenizer)
from kobert_tokenizer import KoBERTTokenizer

# 1. Select the compute device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"์‚ฌ์šฉ ์žฅ์น˜: {device}")

# 2. Load model and tokenizer.
# FIX: previously the tokenizer came from 'skt/kobert-base-v1' while the model
# came from 'monologg/kobert' — mixing checkpoints risks a vocab/model mismatch
# (the original comment itself recommended the official SKT checkpoint).
# Both now load from the same checkpoint.
MODEL_NAME = "skt/kobert-base-v1"
tokenizer = KoBERTTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Move the model to the selected device (GPU or CPU).
model.to(device)
# FIX: switch to eval mode — torch.no_grad() alone does not disable dropout,
# so without this the extracted embeddings would be nondeterministic.
model.eval()


def get_kobert_embedding(text):
    """Return the 768-dim [CLS] sentence embedding of *text* as a numpy array.

    The text is tokenized (padded/truncated to 64 tokens), run through KoBERT,
    and the embedding of the first token ([CLS]) of last_hidden_state is used
    as the sentence representation.
    """
    inputs = tokenizer.batch_encode_plus(
        [text],                 # the batch API expects a list of texts
        padding='max_length',
        max_length=64,          # adjust as needed for longer reviews
        truncation=True,
        return_tensors="pt",    # return PyTorch tensors
    ).to(device)

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # [CLS] token (index 0) of last_hidden_state represents the whole sentence.
    # shape: (1, 768)
    sentence_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    return sentence_embedding[0]  # numpy array (768-dim)


# 4. Run on sample Danggeun Market reviews.
review_sentences = [
    "ํŒ๋งค์ž๋‹˜ ๋งค๋„ˆ๊ฐ€ ๋„ˆ๋ฌด ์ข‹์•„์„œ ๊ธฐ๋ถ„ ์ข‹์€ ๊ฑฐ๋ž˜์˜€์Šต๋‹ˆ๋‹ค.",
    "๋ฌผ๊ฑด ์ƒํƒœ๊ฐ€ ์ƒ๊ฐ๋ณด๋‹ค ๋ณ„๋กœ์—ฌ์„œ ์•„์‰ฝ๋„ค์š”. ๋‹ค์Œ์—” ๊ฑฐ๋ž˜ ์•ˆ ํ•  ๊ฒƒ ๊ฐ™์•„์š”.",
    "์ด ์ž์ „๊ฑฐ ๋ชจ๋ธ์€ ์ค‘๊ณ  ์‹œ์„ธ๊ฐ€ ์–ด๋А ์ •๋„์ผ๊นŒ์š”?",
]

print("\n--- KoBERT ์ž„๋ฒ ๋”ฉ ์ถ”์ถœ ๊ฒฐ๊ณผ ---")
for sentence in review_sentences:
    embedding = get_kobert_embedding(sentence)

    print(f"๋ฌธ์žฅ: '{sentence}'")
    print(f" -> ์ž„๋ฒ ๋”ฉ ์ฐจ์›: {embedding.shape}")
    print(f" -> ์ž„๋ฒ ๋”ฉ ๋ฒกํ„ฐ ์ผ๋ถ€ (์ฒซ 5๊ฐœ): {embedding[:5].round(4)}")
    print("-" * 30)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ numpy
4
+ scikit-learn
5
+ datasets
6
+ accelerate
requirements_bk.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ transformers
2
+ numpy
3
+ scikit-learn
4
+ datasets
5
+ accelerate
roberta_finetune.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from datasets import load_dataset, Dataset
4
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
5
+ from sklearn.metrics import accuracy_score, f1_score
6
+
7
+ # 1. Device check (informational only — the Trainer handles device placement itself during training)
8
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
9
+ print(f"์‚ฌ์šฉ ์žฅ์น˜: {device}")
10
+
11
+ # 2. Load model and tokenizer
12
+ MODEL_NAME = "FacebookAI/xlm-roberta-base"
13
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
14
+ # Load AutoModelForSequenceClassification so a classification head is attached
15
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
16
+ # num_labels=2: distinguishes positive (1) vs. negative (0)
17
+
18
+ # 3. Prepare and preprocess a toy dataset
19
+ # For real training, load your Danggeun Market review data here instead.
20
+ data = {
21
+ 'text': [
22
+ "๋งค๋„ˆ๊ฐ€ ์ •๋ง ์ข‹์œผ์„ธ์š”! ๊ธฐ๋ถ„ ์ข‹์€ ๊ฑฐ๋ž˜์˜€์Šต๋‹ˆ๋‹ค.", # positive
23
+ "๋ฌผ๊ฑด ์ƒํƒœ๊ฐ€ ๋ณ„๋กœ๊ณ  ๋‹ต๋ณ€๋„ ๋„ˆ๋ฌด ๋А๋ ธ์–ด์š”.", # negative
24
+ "๋น ๋ฅธ ์‘๋‹ต๊ณผ ๊น”๋”ํ•œ ๊ฑฐ๋ž˜ ๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค.", # positive
25
+ "๊ฐ€๊ฒฉ์ด ๋„ˆ๋ฌด ๋น„์‹ธ๋„ค์š”. ๋น„์ถ”์ž…๋‹ˆ๋‹ค.", # negative
26
+ "์˜ค๋Š˜๋„ ๋งŒ์กฑ์Šค๋Ÿฌ์šด ์ค‘๊ณ  ๊ฑฐ๋ž˜์˜€์Šต๋‹ˆ๋‹ค.", # positive
27
+ "์‹œ๊ฐ„ ์•ฝ์† ์•ˆ ์ง€ํ‚ค๊ณ  ์—ฐ๋ฝ๋„ ์ž˜ ์•ˆ ๋˜๋„ค์š”.", # negative
28
+ ],
29
+ 'label': [1, 0, 1, 0, 1, 0] # 1: positive, 0: negative
30
+ }
31
+ raw_dataset = Dataset.from_dict(data)
32
+
33
+ # Split into train/test sets (50:50 because this is a toy example)
34
+ train_test_split = raw_dataset.train_test_split(test_size=0.5, seed=42)
35
+ train_dataset = train_test_split['train']
36
+ eval_dataset = train_test_split['test']
37
+
38
+ def tokenize_function(examples):
39
+ # Tokenize the input text.
40
+ return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
41
+
42
+ # Apply the tokenizer to both datasets
43
+ tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
44
+ tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)
45
+
46
+ # 4. Define the evaluation-metric function
47
+ def compute_metrics(p):
48
+ # Take the argmax over the predicted logits to get predicted labels.
49
+ predictions = np.argmax(p.predictions, axis=1)
50
+ # Compute accuracy and F1-score.
51
+ acc = accuracy_score(p.label_ids, predictions)
52
+ f1 = f1_score(p.label_ids, predictions, average='binary') # F1 for the positive (1) class
53
+ return {"accuracy": acc, "f1": f1}
54
+
55
+ # 5. Training configuration (TrainingArguments)
56
+ OUTPUT_DIR = "./xlm-roberta-review-classifier" # path where the model is saved
57
+ training_args = TrainingArguments(
58
+ output_dir=OUTPUT_DIR,
59
+ num_train_epochs=3, # number of epochs (3–5 recommended for real work)
60
+ per_device_train_batch_size=8, # per-GPU train batch size (adjust to available VRAM)
61
+ per_device_eval_batch_size=8, # per-GPU eval batch size
+ # NOTE(review): with 3 training examples, batch size 8 and 3 epochs there are only
+ # ~3 optimizer steps in total, so a 500-step warmup means the LR never reaches its
+ # peak — confirm this is intended for the toy run.
62
+ warmup_steps=500, # steps over which the learning rate ramps up to its peak
63
+ weight_decay=0.01, # weight decay (mitigates overfitting)
64
+ logging_dir='./logs', # log directory
65
+ logging_steps=10,
66
+ eval_strategy="epoch", # evaluate every epoch
67
+ save_strategy="epoch", # save a checkpoint every epoch
68
+ load_best_model_at_end=True, # reload the best checkpoint when training ends
69
+ fp16=torch.cuda.is_available(), # use fp16 on GPU for speed
70
+ )
71
+
72
+ # 6. Build the Trainer and start fine-tuning
73
+ trainer = Trainer(
74
+ model=model,
75
+ args=training_args,
76
+ train_dataset=tokenized_train_dataset,
77
+ eval_dataset=tokenized_eval_dataset,
78
+ compute_metrics=compute_metrics,
79
+ )
80
+
81
+ print("\n--- ํŒŒ์ธ ํŠœ๋‹ ์‹œ์ž‘ ---")
82
+ trainer.train()
83
+
84
+ # 7. Save the final model
85
+ # Save the fine-tuned model and tokenizer to the configured path.
86
+ print(f"\n--- ํŒŒ์ธ ํŠœ๋‹ ์™„๋ฃŒ, ๋ชจ๋ธ์„ {OUTPUT_DIR}์— ์ €์žฅ ์ค‘ ---")
87
+ trainer.save_model(OUTPUT_DIR)
88
+ tokenizer.save_pretrained(OUTPUT_DIR)
89
+
90
+ print("๋ชจ๋ธ ์ €์žฅ ์™„๋ฃŒ. ์ด์ œ ์ €์žฅ๋œ ๋ชจ๋ธ์„ ๋กœ๋“œํ•˜์—ฌ ๋ฐ”๋กœ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")
roberta_test.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

# 1. Choose the compute device: CUDA when available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"์‚ฌ์šฉ ์žฅ์น˜: {device}")

# 2. Load the tokenizer and the base model.
# XLM-RoBERTa-base is loaded as a plain AutoModel (embedding extractor),
# not as a sequence-classification model.
MODEL_NAME = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Place the model on the chosen device (GPU or CPU).
model.to(device)


# 3. Embedding-extraction helper
def get_text_embedding(text):
    """Return the sentence embedding of *text* as a (768,) numpy array.

    The embedding is the first token of last_hidden_state — the [CLS]-style
    token that summarizes the whole sentence.
    """
    # Tokenize and move the encoded batch onto the model's device.
    encoded = tokenizer(
        text,
        return_tensors="pt",  # PyTorch tensors
        padding=True,
        truncation=True,
    ).to(device)

    # Forward pass without gradient tracking (inference only).
    with torch.no_grad():
        model_out = model(**encoded)

    # Slice out the first token's hidden state; shape is (1, 768).
    cls_vector = model_out.last_hidden_state[:, 0, :]
    return cls_vector.cpu().numpy()[0]


# 4. Demo on Danggeun Market review examples
review_sentences = [
    "๋งค๋„ˆ๊ฐ€ ์ •๋ง ์ข‹์œผ์‹œ๊ณ  ๋ฌผ๊ฑด๋„ ๊นจ๋—ํ•ด์„œ ๋งŒ์กฑ์Šค๋Ÿฌ์› ์–ด์š”.",
    "์ด๊ฑด ์ข€ ์•„๋‹Œ๋“ฏ. ๋ฌผ๊ฑด ์ƒํƒœ๋„ ๋ณ„๋กœ๊ณ  ๋‹ต๋ณ€๋„ ๋А๋ ธ์Šต๋‹ˆ๋‹ค.",
    "์ด ๋ชจ๋ธ์˜ ์ค‘๊ณ  ์‹œ์„ธ๋Š” ์–ผ๋งˆ์ธ๊ฐ€์š”?",  # a plain question sentence
    "This is a great product for the price."  # non-Korean input works too
]

print("\n--- XLM-RoBERTa ์ž„๋ฒ ๋”ฉ ์ถ”์ถœ ๊ฒฐ๊ณผ ---")
for sentence in review_sentences:
    embedding = get_text_embedding(sentence)

    print(f"๋ฌธ์žฅ: '{sentence}'")
    print(f" -> ์ž„๋ฒ ๋”ฉ ์ฐจ์›: {embedding.shape}")
    print(f" -> ์ž„๋ฒ ๋”ฉ ๋ฒกํ„ฐ ์ผ๋ถ€ (์ฒซ 5๊ฐœ): {embedding[:5].round(4)}")
    print("-" * 20)
xtreme_distil_finetine.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from datasets import Dataset
4
+ # IntervalStrategy is imported explicitly to avoid version conflicts.
5
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, IntervalStrategy
6
+ from sklearn.metrics import accuracy_score, f1_score
7
+
8
+ # 1. Select the GPU/CPU device
9
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
10
+ print(f"์‚ฌ์šฉ ์žฅ์น˜: {device}")
11
+
12
+ # 2. Load model and tokenizer (lightweight distilled model)
13
+ MODEL_NAME = "microsoft/xtremedistil-l12-h384-uncased"
14
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
15
+ # Load AutoModelForSequenceClassification so a classification head is attached.
16
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
17
+ # num_labels=2: binary classification (positive: 1, negative: 0)
18
+
19
+ # 3. Prepare and preprocess a toy dataset
20
+ # For real use, replace this block with your Korean review data.
21
+ data = {
22
+ 'text': [
23
+ "๋งค๋„ˆ๊ฐ€ ์ •๋ง ์ข‹์œผ์„ธ์š”! ๊ธฐ๋ถ„ ์ข‹์€ ๊ฑฐ๋ž˜์˜€์Šต๋‹ˆ๋‹ค.",
24
+ "๋ฌผ๊ฑด ์ƒํƒœ๊ฐ€ ๋ณ„๋กœ๊ณ  ๋‹ต๋ณ€๋„ ๋„ˆ๋ฌด ๋А๋ ธ์–ด์š”.",
25
+ "๋น ๋ฅธ ์‘๋‹ต๊ณผ ๊น”๋”ํ•œ ๊ฑฐ๋ž˜ ๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค.",
26
+ "๊ฐ€๊ฒฉ์ด ๋„ˆ๋ฌด ๋น„์‹ธ๋„ค์š”. ๋น„์ถ”์ž…๋‹ˆ๋‹ค.",
27
+ "์˜ค๋Š˜๋„ ๋งŒ์กฑ์Šค๋Ÿฌ์šด ์ค‘๊ณ  ๊ฑฐ๋ž˜์˜€์Šต๋‹ˆ๋‹ค.",
28
+ "์‹œ๊ฐ„ ์•ฝ์† ์•ˆ ์ง€ํ‚ค๊ณ  ์—ฐ๋ฝ๋„ ์ž˜ ์•ˆ ๋˜๋„ค์š”.",
29
+ "์นœ์ ˆํ•จ ๋•๋ถ„์— ๊ฑฐ๋ž˜ ๊ณผ์ •์ด ์ˆœ์กฐ๋กœ์› ์Šต๋‹ˆ๋‹ค.",
30
+ "ํŒ๋งค๊ธ€๊ณผ ์‹ค์ œ ์ œํ’ˆ์ด ๋‹ฌ๋ผ์„œ ์‹ค๋งํ–ˆ์Šต๋‹ˆ๋‹ค.",
31
+ ],
32
+ 'label': [1, 0, 1, 0, 1, 0, 1, 0] # 1: positive, 0: negative
33
+ }
34
+ raw_dataset = Dataset.from_dict(data)
35
+
36
+ # Split into train/test sets (4 of the 8 examples each)
37
+ train_test_split = raw_dataset.train_test_split(test_size=0.5, seed=42)
38
+ train_dataset = train_test_split['train']
39
+ eval_dataset = train_test_split['test']
40
+
41
+ def tokenize_function(examples):
42
+ # Tokenize the input text; max_length is sized for the lightweight model.
43
+ return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
44
+
45
+ # Apply the tokenizer and expose the datasets as PyTorch tensors
46
+ tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True).with_format("torch")
47
+ tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True).with_format("torch")
48
+
49
+ # 4. Define the evaluation-metric function
50
+ def compute_metrics(p):
51
+ predictions = np.argmax(p.predictions, axis=1)
52
+ acc = accuracy_score(p.label_ids, predictions)
53
+ f1 = f1_score(p.label_ids, predictions, average='binary') # F1 for the positive (1) class
54
+ return {"accuracy": acc, "f1": f1}
55
+
56
+ # 5. Training configuration (TrainingArguments)
57
+ OUTPUT_DIR = "./xtreme-distil-review-classifier" # model save path
58
+ training_args = TrainingArguments(
59
+ output_dir=OUTPUT_DIR,
60
+ num_train_epochs=5, # slightly more epochs since the model is lightweight
61
+ per_device_train_batch_size=8, # batch size
62
+ per_device_eval_batch_size=8,
+ # NOTE(review): 500 warmup steps far exceed the total steps for this 8-example
+ # toy dataset, so the LR never reaches its peak — confirm intended.
63
+ warmup_steps=500,
64
+ weight_decay=0.01,
65
+ logging_dir='./logs',
66
+ logging_steps=10,
67
+
68
+ # Unify evaluation and save strategies to EPOCH so load_best_model_at_end works.
69
+ eval_strategy=IntervalStrategy.EPOCH,
70
+ save_strategy=IntervalStrategy.EPOCH,
71
+
72
+ load_best_model_at_end=True,
73
+ fp16=torch.cuda.is_available(),
74
+ )
75
+
76
+ # 6. Build the Trainer and start fine-tuning
77
+ trainer = Trainer(
78
+ model=model,
79
+ args=training_args,
80
+ train_dataset=tokenized_train_dataset,
81
+ eval_dataset=tokenized_eval_dataset,
82
+ compute_metrics=compute_metrics,
83
+ )
84
+
85
+ print("\n--- ํŒŒ์ธ ํŠœ๋‹ ์‹œ์ž‘ (XTREME-Distil ๋ชจ๋ธ) ---")
86
+ trainer.train()
87
+
88
+ # 7. Save the final model
89
+ print(f"\n--- ํŒŒ์ธ ํŠœ๋‹ ์™„๋ฃŒ, ๋ชจ๋ธ์„ {OUTPUT_DIR}์— ์ €์žฅ ์ค‘ ---")
90
+ trainer.save_model(OUTPUT_DIR)
91
+ tokenizer.save_pretrained(OUTPUT_DIR)
92
+
93
+ print("๋ชจ๋ธ ์ €์žฅ ์™„๋ฃŒ. ์ด์ œ ์ €์žฅ๋œ ๋ชจ๋ธ์„ ๋กœ๋“œํ•˜์—ฌ ๋ฐ”๋กœ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")