Update app.py

app.py CHANGED
@@ -5,68 +5,19 @@ import pickle
 import gluonnlp as nlp
 import numpy as np
 import os
-import sys  # import the sys module
+import sys  # import sys so the service can exit on errors

-#
-from transformers import BertModel,
+# Only AutoTokenizer from transformers is used.
+from transformers import AutoTokenizer  # BertModel, BertForSequenceClassification, etc. are no longer needed directly.
 from torch.utils.data import Dataset, DataLoader
 import logging  # keep the logging import
-from huggingface_hub import hf_hub_download  # import hf_hub_download
-
-
-# --- 1. BERTClassifier model class definition (moved from model.py) ---
-class BERTClassifier(torch.nn.Module):
-    def __init__(self,
-                 bert,
-                 hidden_size=768,
-                 num_classes=5,  # number of target classes (must match the size of the category dict)
-                 dr_rate=None,
-                 params=None):
-        super(BERTClassifier, self).__init__()
-        self.bert = bert
-        self.dr_rate = dr_rate
-
-        self.classifier = torch.nn.Linear(hidden_size, num_classes)
-        if dr_rate:
-            self.dropout = torch.nn.Dropout(p=dr_rate)
-
-    def gen_attention_mask(self, token_ids, valid_length):
-        attention_mask = torch.zeros_like(token_ids)
-        for i, v in enumerate(valid_length):
-            attention_mask[i][:v] = 1
-        return attention_mask.float()
-
-    def forward(self, token_ids, valid_length, segment_ids):
-        attention_mask = self.gen_attention_mask(token_ids, valid_length)
-
-        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(), attention_mask=attention_mask.float().to(token_ids.device), return_dict=False)
-
-        if self.dr_rate:
-            out = self.dropout(pooler)
-        else:
-            out = pooler
-        return self.classifier(out)
+from huggingface_hub import hf_hub_download  # keep the hf_hub_download import
+# The collections module may no longer be needed, but it is kept just in case.
+import collections

-# ---
-class BERTDataset(Dataset):
-    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len, pad, pair):
-        # nlp.data.BERTSentenceTransform takes a tokenizer function.
-        # Pass the AutoTokenizer's tokenize method directly.
-        transform = nlp.data.BERTSentenceTransform(
-            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair
-        )
-        self.sentences = [transform([i[sent_idx]]) for i in dataset]
-        self.labels = [np.int32(i[label_idx]) for i in dataset]
-
-    def __getitem__(self, i):
-        return (self.sentences[i] + (self.labels[i],))
-
-    def __len__(self):
-        return len(self.labels)
-
-# --- 3. FastAPI app and global variable setup ---
+# --- 1. FastAPI app and global variable setup ---
 app = FastAPI()
-device = torch.device("cpu")  #
+device = torch.device("cpu")  # the free tier on Hugging Face Spaces mainly uses CPU

 # ✅ Load category (must be in the GitHub repository root)
 try:
@@ -87,38 +38,24 @@ except FileNotFoundError:
     sys.exit(1)  # do not start the service if the file is missing

 # ✅ Load tokenizer (using transformers.AutoTokenizer)
-# Use AutoTokenizer instead of KoBERTTokenizer to load the KoBERT model's tokenizer.
-# This avoids the XLNetTokenizer warning and the kobert_tokenizer installation issue.
 tokenizer = AutoTokenizer.from_pretrained('skt/kobert-base-v1')
 print("Tokenizer loaded successfully.")

-# ✅ Load model
-#
-bertmodel = BertModel.from_pretrained('skt/kobert-base-v1')
-model = BERTClassifier(
-    bertmodel,
-    dr_rate=0.5,  # change this to the dr_rate value used during training
-    num_classes=len(category)
-)
-
-# Load the textClassifierModel.pt file
+# ✅ Load model (downloaded from the Hugging Face Hub)
+# textClassifierModel.pt is assumed to be an already-slimmed, complete model object and is loaded directly.
 try:
     HF_MODEL_REPO_ID = "hiddenFront/TextClassifier"  # your actual Hugging Face repository ID
-    HF_MODEL_FILENAME = "textClassifierModel.pt"
+    HF_MODEL_FILENAME = "textClassifierModel.pt"  # must match the filename uploaded to the Hugging Face Hub
+
     model_path = hf_hub_download(repo_id=HF_MODEL_REPO_ID, filename=HF_MODEL_FILENAME)
     print(f"Model file downloaded successfully to '{model_path}'.")

-
-
-
-
-
-
-            name = name[7:]
-        new_state_dict[name] = v
-
-    model.load_state_dict(new_state_dict)
-    model.to(device)  # move the model to the device
+    # --- Key change ---
+    # Load the slimmed model object directly.
+    # The file is already a PyTorch model object (with quantization applied), so it is used as-is.
+    model = torch.load(model_path, map_location=device)
+    # --- End of key change ---
+
     model.eval()  # set inference mode
     print("Model loaded successfully.")

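One caveat on the new loading path: torch.load can only rebuild a fully pickled model if the class that defines it is importable at load time, and this commit removes the BERTClassifier definition from app.py. That works only if textClassifierModel.pt was saved in a class-independent form, such as a TorchScript export; a plain pickle of a BERTClassifier instance would fail to load with "Can't get attribute 'BERTClassifier'". A sketch of the TorchScript pair, assuming the training side can still instantiate the model:

import torch

# Training side (hypothetical): export a class-independent artifact.
scripted = torch.jit.script(model)  # or torch.jit.trace(model, example_inputs)
torch.jit.save(scripted, "textClassifierModel.pt")

# Serving side: loading needs no BERTClassifier definition.
model = torch.jit.load("textClassifierModel.pt", map_location=torch.device("cpu"))
model.eval()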
@@ -127,6 +64,25 @@ except Exception as e:
     sys.exit(1)  # do not start the service if the model fails to load


+# --- 2. BERTDataset class definition (moved from dataset.py) ---
+# This class converts data into the model's input format.
+class BERTDataset(Dataset):
+    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len, pad, pair):
+        # nlp.data.BERTSentenceTransform takes a tokenizer function.
+        # Pass the AutoTokenizer's tokenize method directly.
+        transform = nlp.data.BERTSentenceTransform(
+            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair
+        )
+        self.sentences = [transform([i[sent_idx]]) for i in dataset]
+        self.labels = [np.int32(i[label_idx]) for i in dataset]
+
+    def __getitem__(self, i):
+        return (self.sentences[i] + (self.labels[i],))
+
+    def __len__(self):
+        return len(self.labels)
+
+
 # ✅ Parameters needed to build the dataset
 max_len = 64
 batch_size = 32
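Both BERTDataset and the predict() hunk below reference a global vocab that this diff never defines or changes. In the usual KoBERT recipe it is built from the tokenizer's SentencePiece model via gluonnlp; a sketch under that assumption (vocab_file is exposed by the slow, sentencepiece-backed tokenizer):

import gluonnlp as nlp
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('skt/kobert-base-v1')
# Build a gluonnlp BERTVocab from the tokenizer's SentencePiece file.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')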
@@ -136,7 +92,8 @@ def predict(predict_sentence):
     data = [predict_sentence, '0']
     dataset_another = [data]
     # num_workers should be set to 0 in a deployment environment
-
+    # Pass tokenizer.tokenize to BERTDataset.
+    another_test = BERTDataset(dataset_another, 0, 1, tokenizer.tokenize, vocab, max_len, True, False)
     test_dataLoader = DataLoader(another_test, batch_size=batch_size, num_workers=0)

     model.eval()  # set the model to evaluation mode before predicting
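The remainder of predict() is not part of this diff; it typically drains test_dataLoader along the following lines. A sketch assuming category maps class indices to label names and the loaded model keeps the BERTClassifier forward signature (token_ids, valid_length, segment_ids):

import torch

def run_inference(model, test_dataLoader, category, device):
    results = []
    with torch.no_grad():  # inference only, no gradients
        for token_ids, valid_length, segment_ids, label in test_dataLoader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            probs = torch.nn.functional.softmax(out, dim=-1)
            for row in probs:
                idx = int(row.argmax())
                results.append((category[idx], float(row[idx])))
    return results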