Spaces:
Runtime error
Commit · 899f482
1 Parent(s): 5a004f0
- .dockerignore +8 -0
- .gitignore +7 -0
- Dockerfile +26 -0
- README.md +12 -0
- kobert_test.py +58 -0
- requirements.txt +6 -0
- requirements_bk.txt +5 -0
- roberta_finetune.py +90 -0
- roberta_test.py +56 -0
- xtreme_distil_finetine.py +93 -0
.dockerignore
ADDED
@@ -0,0 +1,8 @@
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.log
+.git
+.gitignore
+.venv/
.gitignore
ADDED
@@ -0,0 +1,7 @@
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.log
+.venv/
+xtreme-distil-review-classifier/
Dockerfile
ADDED
@@ -0,0 +1,26 @@
+# Dockerfile
+FROM python:3.11-slim
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libglib2.0-0 libgl1 && \
+    rm -rf /var/lib/apt/lists/*
+
+# Force the cache/token paths to /data
+ENV HF_HOME=/data \
+    TRANSFORMERS_CACHE=/data/transformers \
+    HF_HUB_CACHE=/data/hub \
+    HF_HUB_DISABLE_TELEMETRY=1 \
+    TOKENIZERS_PARALLELISM=false \
+    PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1
+
+# Create the cache directories and make them writable
+RUN mkdir -p /data/transformers /data/hub && chmod -R 777 /data
+
+WORKDIR /app
+COPY requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r /app/requirements.txt
+COPY . /app
+
+EXPOSE 7860
+CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT:-7860} --workers 1"]
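The CMD starts `uvicorn main:app`, but no main.py is part of this commit and neither fastapi nor uvicorn appear in requirements.txt, which may explain the Space's runtime error. A minimal sketch of the entry point the CMD assumes (hypothetical, not the author's code; fastapi and uvicorn would also need to be added to requirements.txt):

# main.py (hypothetical) - minimal ASGI app so `uvicorn main:app` has something to serve
from fastapi import FastAPI

app = FastAPI()

@app.get("/")
def health():
    # Simple health check; real inference endpoints would go here.
    return {"status": "ok"}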
README.md
CHANGED
@@ -8,3 +8,15 @@ pinned: false
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+py -3.10 -m uv venv venv
+
+.\venv\Scripts\Activate.ps1
+
+How to check the CUDA toolkit version on Windows:
+nvcc --version
+
+Maximum CUDA version supported by the GPU driver:
+nvidia-smi
+
+uv pip install torch torchvision --index-url https://download.pytorch.org/whl/cu130
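After installing the CUDA build of PyTorch as above, a quick sanity check (a minimal sketch, not part of the commit) confirms the GPU is actually visible:

# check_torch.py (hypothetical) - verify the installed torch build can use the GPU
import torch

print(torch.__version__)           # a CUDA build reports a version like "2.x.x+cuXXX"
print(torch.cuda.is_available())   # True if the driver and CUDA runtime are set up correctly
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))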
kobert_test.py
ADDED
@@ -0,0 +1,58 @@
+import torch
+import numpy as np
+from transformers import AutoModel
+# Load the KoBERT-specific tokenizer (different from the stock Hugging Face tokenizers)
+from kobert_tokenizer import KoBERTTokenizer
+
+# 1. Select the GPU/CPU device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Device in use: {device}")
+
+# 2. Load the model and tokenizer (revised)
+MODEL_NAME = "monologg/kobert"
+# When loading the tokenizer, using 'skt/kobert-base-v1' (SKT Brain's official repository)
+# instead of 'monologg/kobert' is the more reliable choice.
+tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
+model = AutoModel.from_pretrained(MODEL_NAME)
+
+# Move the model to the selected device (GPU or CPU)
+model.to(device)
+
+# 3. Define the embedding extraction function
+def get_kobert_embedding(text):
+    # Tokenize the text and convert it to model inputs
+    inputs = tokenizer.batch_encode_plus(
+        [text],  # the input is passed as a list
+        padding='max_length',
+        max_length=64,  # maximum length (adjust as needed)
+        truncation=True,
+        return_tensors="pt"  # return PyTorch tensors
+    ).to(device)
+
+    # Model inference
+    with torch.no_grad():
+        # outputs contains last_hidden_state (the per-token embeddings) and more.
+        outputs = model(**inputs)
+
+    # Sentence embedding: use the [CLS] token's embedding.
+    # The first token (index 0) of last_hidden_state is [CLS] and represents the whole sentence.
+    # shape: (1, 768)
+    sentence_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+
+    return sentence_embedding[0]  # return a 768-dimensional numpy array
+
+# 4. Run on sample Danggeun Market (Karrot) reviews
+review_sentences = [
+    "판매자님 매너가 너무 좋아서 기분 좋은 거래였습니다.",
+    "물건 상태가 생각보다 별로여서 아쉽네요. 다음엔 거래 안 할 것 같아요.",
+    "이 자전거 모델의 중고 시세가 어느 정도일까요?",
+]
+
+print("\n--- KoBERT embedding extraction results ---")
+for sentence in review_sentences:
+    embedding = get_kobert_embedding(sentence)
+
+    print(f"Sentence: '{sentence}'")
+    print(f" -> embedding shape: {embedding.shape}")  # 768 dimensions
+    print(f" -> first 5 values: {embedding[:5].round(4)}")
+    print("-" * 30)
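Since scikit-learn is already in requirements.txt, the extracted vectors can be compared directly. A short follow-up sketch (illustrative only, not part of the commit) that scores the similarity of the first two reviews:

# Hypothetical follow-up: compare two review embeddings with cosine similarity.
from sklearn.metrics.pairwise import cosine_similarity

emb_a = get_kobert_embedding(review_sentences[0]).reshape(1, -1)
emb_b = get_kobert_embedding(review_sentences[1]).reshape(1, -1)
print(f"cosine similarity: {cosine_similarity(emb_a, emb_b)[0][0]:.4f}")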
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+torch
+transformers
+numpy
+scikit-learn
+datasets
+accelerate
requirements_bk.txt
ADDED
@@ -0,0 +1,5 @@
+transformers
+numpy
+scikit-learn
+datasets
+accelerate
roberta_finetune.py
ADDED
@@ -0,0 +1,90 @@
+import torch
+import numpy as np
+from datasets import load_dataset, Dataset
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
+from sklearn.metrics import accuracy_score, f1_score
+
+# 1. Select the GPU/CPU device (the Trainer handles placement automatically; this is just for confirmation)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Device in use: {device}")
+
+# 2. Load the model and tokenizer
+MODEL_NAME = "FacebookAI/xlm-roberta-base"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+# Load AutoModelForSequenceClassification for classification (a classification head is added)
+model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
+# num_labels=2: distinguish positive (1) from negative (0)
+
+# 3. Prepare and preprocess a toy dataset
+# For real training, load your own Danggeun Market review data here.
+data = {
+    'text': [
+        "매너가 정말 좋으세요! 기분 좋은 거래였습니다.",  # positive
+        "물건 상태가 별로고 답변도 너무 느렸어요.",  # negative
+        "빠른 응답과 깔끔한 거래 감사합니다.",  # positive
+        "가격이 너무 비싸네요. 비추입니다.",  # negative
+        "오늘도 만족스러운 중고 거래였습니다.",  # positive
+        "시간 약속 안 지키고 연락도 잘 안 되네요.",  # negative
+    ],
+    'label': [1, 0, 1, 0, 1, 0]  # 1: positive, 0: negative
+}
+raw_dataset = Dataset.from_dict(data)
+
+# Split the dataset into train and eval sets (50:50 since this is a toy example)
+train_test_split = raw_dataset.train_test_split(test_size=0.5, seed=42)
+train_dataset = train_test_split['train']
+eval_dataset = train_test_split['test']
+
+def tokenize_function(examples):
+    # Tokenize the input text.
+    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
+
+# Apply the tokenizer to the datasets
+tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
+tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)
+
+# 4. Define the evaluation metrics
+def compute_metrics(p):
+    # Take the argmax of the predicted logits to get the predicted labels.
+    predictions = np.argmax(p.predictions, axis=1)
+    # Compute accuracy and F1 score.
+    acc = accuracy_score(p.label_ids, predictions)
+    f1 = f1_score(p.label_ids, predictions, average='binary')  # F1 for the positive class (1)
+    return {"accuracy": acc, "f1": f1}
+
+# 5. Training configuration (TrainingArguments)
+OUTPUT_DIR = "./xlm-roberta-review-classifier"  # where the model will be saved
+training_args = TrainingArguments(
+    output_dir=OUTPUT_DIR,
+    num_train_epochs=3,  # number of epochs (3-5 recommended for real runs)
+    per_device_train_batch_size=8,  # per-device train batch size (adjust to available VRAM)
+    per_device_eval_batch_size=8,  # per-device eval batch size
+    warmup_steps=500,  # number of steps over which the learning rate ramps up to its peak
+    weight_decay=0.01,  # weight decay (guards against overfitting)
+    logging_dir='./logs',  # log directory
+    logging_steps=10,
+    eval_strategy="epoch",  # evaluate every epoch
+    save_strategy="epoch",  # save a checkpoint every epoch
+    load_best_model_at_end=True,  # reload the best-performing model at the end of training
+    fp16=torch.cuda.is_available(),  # use fp16 for speed when a GPU is available
+)
+
+# 6. Create the Trainer and start training
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_train_dataset,
+    eval_dataset=tokenized_eval_dataset,
+    compute_metrics=compute_metrics,
+)
+
+print("\n--- Fine-tuning started ---")
+trainer.train()
+
+# 7. Save the final model
+# Save the trained model and tokenizer to the specified path.
+print(f"\n--- Fine-tuning finished; saving the model to {OUTPUT_DIR} ---")
+trainer.save_model(OUTPUT_DIR)
+tokenizer.save_pretrained(OUTPUT_DIR)
+
+print("Model saved. The saved model can now be loaded and used directly.")
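The closing message says the saved model can be loaded and used right away; a short sketch of what that could look like (illustrative only, using the standard transformers pipeline API rather than anything in this commit):

# Hypothetical inference with the saved classifier.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="./xlm-roberta-review-classifier",      # OUTPUT_DIR from the training script
    tokenizer="./xlm-roberta-review-classifier",
)
# With the default label names, LABEL_1 corresponds to positive and LABEL_0 to negative.
print(classifier("빠른 응답과 깔끔한 거래 감사합니다."))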
roberta_test.py
ADDED
@@ -0,0 +1,56 @@
+import torch
+from transformers import AutoTokenizer, AutoModel
+import numpy as np
+
+# 1. Select the GPU/CPU device
+# Use 'cuda' if a GPU is available, otherwise fall back to 'cpu'.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Device in use: {device}")
+
+# 2. Load the model and tokenizer
+# XLM-RoBERTa-base is loaded here as a plain embedding model, not as a sequence classifier.
+MODEL_NAME = "FacebookAI/xlm-roberta-base"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+model = AutoModel.from_pretrained(MODEL_NAME)
+
+# Move the model to the selected device (GPU or CPU)
+model.to(device)
+
+# 3. Define the embedding extraction function
+def get_text_embedding(text):
+    # Tokenize the text and move the tensors to the device
+    inputs = tokenizer(
+        text,
+        return_tensors="pt",  # return PyTorch tensors
+        padding=True,
+        truncation=True
+    ).to(device)
+
+    # Model inference
+    with torch.no_grad():
+        # outputs contains last_hidden_state (the per-token embeddings) and more.
+        outputs = model(**inputs)
+
+    # Sentence embedding: use the [CLS] token's embedding.
+    # The first token (index 0) of last_hidden_state is [CLS] and represents the whole sentence.
+    # shape: (1, 768)
+    sentence_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+
+    return sentence_embedding[0]  # return as a numpy array
+
+# 4. Run on sample Danggeun Market (Karrot) reviews
+review_sentences = [
+    "매너가 정말 좋으시고 물건도 깨끗해서 만족스러웠어요.",
+    "이건 좀 아닌듯. 물건 상태도 별로고 답변도 느렸습니다.",
+    "이 모델의 중고 시세는 얼마인가요?",  # an ordinary question sentence
+    "This is a great product for the price."  # non-Korean sentences can be handled as well
+]
+
+print("\n--- XLM-RoBERTa embedding extraction results ---")
+for sentence in review_sentences:
+    embedding = get_text_embedding(sentence)
+
+    print(f"Sentence: '{sentence}'")
+    print(f" -> embedding shape: {embedding.shape}")  # 768 dimensions
+    print(f" -> first 5 values: {embedding[:5].round(4)}")
+    print("-" * 20)
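The script takes the first token's hidden state as the sentence vector; with RoBERTa-style models that position holds the "<s>" token rather than a literal [CLS], and masked mean pooling is a common alternative. A sketch of that variation (an illustrative addition, not part of the commit, reusing the tokenizer and model loaded above):

# Hypothetical variant: mean pooling over real (non-padding) tokens instead of the first token.
def get_mean_pooled_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    mask = inputs["attention_mask"].unsqueeze(-1)             # (1, seq_len, 1)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)    # sum of non-padding token vectors
    counts = mask.sum(dim=1).clamp(min=1)                     # number of non-padding tokens
    return (summed / counts)[0].cpu().numpy()                 # (768,) mean-pooled sentence vector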
xtreme_distil_finetine.py
ADDED
@@ -0,0 +1,93 @@
+import torch
+import numpy as np
+from datasets import Dataset
+# Import IntervalStrategy explicitly to avoid version conflicts.
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, IntervalStrategy
+from sklearn.metrics import accuracy_score, f1_score
+
+# 1. Select the GPU/CPU device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Device in use: {device}")
+
+# 2. Load the model and tokenizer (a lightweight distilled model)
+MODEL_NAME = "microsoft/xtremedistil-l12-h384-uncased"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+# Load AutoModelForSequenceClassification to attach a classification head.
+model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
+# num_labels=2: binary classification (positive: 1, negative: 0)
+
+# 3. Prepare and preprocess a toy dataset
+# For real use, replace this with your own Korean review data.
+data = {
+    'text': [
+        "매너가 정말 좋으세요! 기분 좋은 거래였습니다.",
+        "물건 상태가 별로고 답변도 너무 느렸어요.",
+        "빠른 응답과 깔끔한 거래 감사합니다.",
+        "가격이 너무 비싸네요. 비추입니다.",
+        "오늘도 만족스러운 중고 거래였습니다.",
+        "시간 약속 안 지키고 연락도 잘 안 되네요.",
+        "친절함 덕분에 거래 과정이 순조로웠습니다.",
+        "판매글과 실제 제품이 달라서 실망했습니다.",
+    ],
+    'label': [1, 0, 1, 0, 1, 0, 1, 0]  # 1: positive, 0: negative
+}
+raw_dataset = Dataset.from_dict(data)
+
+# Split the dataset into train and eval sets (4 of the 8 examples each)
+train_test_split = raw_dataset.train_test_split(test_size=0.5, seed=42)
+train_dataset = train_test_split['train']
+eval_dataset = train_test_split['test']
+
+def tokenize_function(examples):
+    # Tokenize the input text with a max_length suited to the lightweight model.
+    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
+
+# Apply the tokenizer and set the datasets to PyTorch tensor format
+tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True).with_format("torch")
+tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True).with_format("torch")
+
+# 4. Define the evaluation metrics
+def compute_metrics(p):
+    predictions = np.argmax(p.predictions, axis=1)
+    acc = accuracy_score(p.label_ids, predictions)
+    f1 = f1_score(p.label_ids, predictions, average='binary')  # F1 for the positive class (1)
+    return {"accuracy": acc, "f1": f1}
+
+# 5. Training configuration (TrainingArguments)
+OUTPUT_DIR = "./xtreme-distil-review-classifier"  # where the model is saved
+training_args = TrainingArguments(
+    output_dir=OUTPUT_DIR,
+    num_train_epochs=5,  # slightly more epochs since this is a lightweight model
+    per_device_train_batch_size=8,  # batch size
+    per_device_eval_batch_size=8,
+    warmup_steps=500,
+    weight_decay=0.01,
+    logging_dir='./logs',
+    logging_steps=10,
+
+    # Use the same EPOCH strategy for evaluation and saving so load_best_model_at_end works.
+    eval_strategy=IntervalStrategy.EPOCH,
+    save_strategy=IntervalStrategy.EPOCH,
+
+    load_best_model_at_end=True,
+    fp16=torch.cuda.is_available(),
+)
+
+# 6. Create the Trainer and start training
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_train_dataset,
+    eval_dataset=tokenized_eval_dataset,
+    compute_metrics=compute_metrics,
+)
+
+print("\n--- Fine-tuning started (XTREME-Distil model) ---")
+trainer.train()
+
+# 7. Save the final model
+print(f"\n--- Fine-tuning finished; saving the model to {OUTPUT_DIR} ---")
+trainer.save_model(OUTPUT_DIR)
+tokenizer.save_pretrained(OUTPUT_DIR)
+
+print("Model saved. The saved model can now be loaded and used directly.")
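A quick way to sanity-check the freshly trained classifier while it is still in memory (a hedged sketch, not part of the commit; the sample review text is hypothetical):

# Hypothetical follow-up: classify a single review with the trained in-memory model.
model.eval()
sample = "배송이 빠르고 상태도 좋아요."  # hypothetical review text
enc = tokenizer(sample, return_tensors="pt", truncation=True, max_length=128).to(model.device)
with torch.no_grad():
    logits = model(**enc).logits
probs = torch.softmax(logits, dim=-1)[0]
print(f"negative: {probs[0].item():.3f}, positive: {probs[1].item():.3f}")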