Spaces:
Runtime error
Runtime error
| import torch | |
| import numpy as np | |
| from datasets import Dataset | |
| # Import IntervalStrategy explicitly to avoid version conflicts. | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, IntervalStrategy | |
| from sklearn.metrics import accuracy_score, f1_score | |
# 1. Device selection: use CUDA when torch reports it as available, else the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
# NOTE(review): the message literal looks mis-encoded (mojibake); it is kept
# byte-for-byte here — confirm against the original source.
print(f"์ฌ์ฉ ์ฅ์น: {device}")
# 2. Load the tokenizer and the model (a lightweight checkpoint is used).
# NOTE(review): the checkpoint name says "uncased"; confirm that its vocabulary
# suits the Korean review text this script is meant to classify.
MODEL_NAME = "microsoft/xtremedistil-l12-h384-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# AutoModelForSequenceClassification puts a classification layer on top of the
# encoder; num_labels=2 makes it a binary classifier (positive: 1, negative: 0).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)
# 3. Prepare and pre-process a mock dataset.
# For real use, replace this section with your own Korean review data.
# NOTE(review): the sample strings below look mis-encoded (mojibake); they are
# kept byte-for-byte here — confirm against the original source.
data = dict(
    text=[
        "๋งค๋๊ฐ ์ ๋ง ์ข์ผ์ธ์! ๊ธฐ๋ถ ์ข์ ๊ฑฐ๋์์ต๋๋ค.",
        "๋ฌผ๊ฑด ์ํ๊ฐ ๋ณ๋ก๊ณ ๋ต๋ณ๋ ๋๋ฌด ๋๋ ธ์ด์.",
        "๋น ๋ฅธ ์๋ต๊ณผ ๊น๋ํ ๊ฑฐ๋ ๊ฐ์ฌํฉ๋๋ค.",
        "๊ฐ๊ฒฉ์ด ๋๋ฌด ๋น์ธ๋ค์. ๋น์ถ์ ๋๋ค.",
        "์ค๋๋ ๋ง์กฑ์ค๋ฌ์ด ์ค๊ณ ๊ฑฐ๋์์ต๋๋ค.",
        "์๊ฐ ์ฝ์ ์ ์งํค๊ณ ์ฐ๋ฝ๋ ์ ์ ๋๋ค์.",
        "์น์ ํจ ๋๋ถ์ ๊ฑฐ๋ ๊ณผ์ ์ด ์์กฐ๋ก์ ์ต๋๋ค.",
        "ํ๋งค๊ธ๊ณผ ์ค์ ์ ํ์ด ๋ฌ๋ผ์ ์ค๋งํ์ต๋๋ค.",
    ],
    label=[1, 0, 1, 0, 1, 0, 1, 0],  # 1: positive, 0: negative
)
raw_dataset = Dataset.from_dict(data)

# Split the 8 samples into a training half and an evaluation half (4 each);
# the fixed seed keeps the split reproducible.
train_test_split = raw_dataset.train_test_split(test_size=0.5, seed=42)
train_dataset, eval_dataset = train_test_split["train"], train_test_split["test"]
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Every sequence is truncated and padded to a fixed length of 128 tokens,
    a length chosen to suit the lightweight model.

    Args:
        examples: Mapping with a ``'text'`` entry holding the input text(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    return encoded
# Apply the tokenizer to both splits in batches, then switch each result to
# the PyTorch tensor format.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_train_dataset = tokenized_train_dataset.with_format("torch")

tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = tokenized_eval_dataset.with_format("torch")
# 4. Evaluation metrics
def compute_metrics(p):
    """Compute accuracy and binary F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the predicted
            class is the argmax over axis 1) and ``label_ids`` (true labels).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    gold = p.label_ids
    predicted = np.argmax(p.predictions, axis=1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        # average='binary' scores the positive class (label 1).
        "f1": f1_score(gold, predicted, average='binary'),
    }
# 5. Training configuration (TrainingArguments)
OUTPUT_DIR = "./xtreme-distil-review-classifier"  # where the model is saved

NUM_TRAIN_EPOCHS = 5  # slightly more epochs because the model is lightweight
TRAIN_BATCH_SIZE = 8  # per-device training batch size
MAX_WARMUP_STEPS = 500  # upper bound on the learning-rate warmup

# A fixed 500-step warmup is longer than the whole run on a small dataset:
# with only a handful of optimizer steps the learning rate would stay at a
# tiny fraction of its target and the model would barely be updated.
# Cap the warmup at 10% of the estimated number of optimizer steps instead.
# The estimate assumes a single device and no gradient accumulation.
steps_per_epoch = max(
    1,
    (len(tokenized_train_dataset) + TRAIN_BATCH_SIZE - 1) // TRAIN_BATCH_SIZE,
)
total_train_steps = steps_per_epoch * NUM_TRAIN_EPOCHS
warmup_steps = min(MAX_WARMUP_STEPS, total_train_steps // 10)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=8,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluation and saving both run once per epoch; the two strategies are
    # kept identical so that load_best_model_at_end can be enabled.
    eval_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    load_best_model_at_end=True,
    # Mixed precision is switched on only when CUDA is available.
    fp16=torch.cuda.is_available(),
)
# 6. Build the Trainer and start training.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

# NOTE(review): the banner literal looks mis-encoded (mojibake); it is kept
# byte-for-byte here — confirm against the original source.
print("\n--- ํ์ธ ํ๋ ์์ (XTREME-Distil ๋ชจ๋ธ) ---")
trainer.train()
# 7. Save the final model.
# NOTE(review): the message literals look mis-encoded (mojibake); they are
# kept byte-for-byte here — confirm against the original source.
print(f"\n--- ํ์ธ ํ๋ ์๋ฃ, ๋ชจ๋ธ์ {OUTPUT_DIR}์ ์ ์ฅ ์ค ---")

# The model is saved first, then the tokenizer, both into OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("๋ชจ๋ธ ์ ์ฅ ์๋ฃ. ์ด์ ์ ์ฅ๋ ๋ชจ๋ธ์ ๋ก๋ํ์ฌ ๋ฐ๋ก ์ฌ์ฉํ ์ ์์ต๋๋ค.")