Update app.py

app.py CHANGED
@@ -5,35 +5,134 @@ import pickle
 import gluonnlp as nlp
 import numpy as np
 import os
-from kobert_tokenizer import KoBERTTokenizer
-from transformers import BertModel
-from torch.utils.data import Dataset
-
-import logging
+from kobert_tokenizer import KoBERTTokenizer  # kobert_tokenizer import kept
+from transformers import BertModel  # BertModel import kept
+from torch.utils.data import Dataset, DataLoader  # DataLoader import added
+import logging  # logging module import kept
+import sys  # needed for the sys.exit() calls below
+import collections  # needed for collections.OrderedDict below

+# --- 1. BERTClassifier model class definition (moved in from model.py) ---
+# This class defines the model architecture.
+class BERTClassifier(torch.nn.Module):
+    def __init__(self,
+                 bert,
+                 hidden_size=768,
+                 num_classes=5,  # number of target classes (must match the size of the category dict)
+                 dr_rate=None,
+                 params=None):
+        super(BERTClassifier, self).__init__()
+        self.bert = bert
+        self.dr_rate = dr_rate
+
+        self.classifier = torch.nn.Linear(hidden_size, num_classes)
+        if dr_rate:
+            self.dropout = torch.nn.Dropout(p=dr_rate)
+
+    def gen_attention_mask(self, token_ids, valid_length):
+        # 1 for real tokens up to each sentence's valid length, 0 for padding
+        attention_mask = torch.zeros_like(token_ids)
+        for i, v in enumerate(valid_length):
+            attention_mask[i][:v] = 1
+        return attention_mask.float()
+
+    def forward(self, token_ids, valid_length, segment_ids):
+        attention_mask = self.gen_attention_mask(token_ids, valid_length)
+
+        # Adjusted to BertModel's output structure:
+        # with return_dict=False, Hugging Face Transformers' BertModel returns
+        # (last_hidden_state, pooler_output, ...); pooler_output is the CLS token's
+        # final hidden state passed through an extra dense layer, and is used here.
+        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(), attention_mask=attention_mask.float().to(token_ids.device), return_dict=False)
+
+        if self.dr_rate:
+            out = self.dropout(pooler)
+        else:
+            out = pooler
+        return self.classifier(out)
+
+# --- 2. BERTDataset class definition (moved in from dataset.py) ---
+# This class converts raw data into the model's input format.
+class BERTDataset(Dataset):
+    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len, pad, pair):
+        transform = nlp.data.BERTSentenceTransform(
+            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair
+        )
+        self.sentences = [transform([i[sent_idx]]) for i in dataset]
+        self.labels = [np.int32(i[label_idx]) for i in dataset]
+
+    def __getitem__(self, i):
+        return (self.sentences[i] + (self.labels[i],))
+
+    def __len__(self):
+        return len(self.labels)
+
+# --- 3. FastAPI app and global state setup ---
 app = FastAPI()
-device = torch.device("cpu")
+device = torch.device("cpu")  # Render's free tier mostly provides CPU only

-# ✅ Load category
-
-category
+# ✅ Load category (the file must be in the GitHub repository root)
+try:
+    with open("category.pkl", "rb") as f:
+        category = pickle.load(f)
+    print("category.pkl loaded successfully.")
+except FileNotFoundError:
+    print("Error: category.pkl not found. Check that it is in the project root.")
+    sys.exit(1)  # do not start the service if the file is missing

-# ✅ Load vocab
-
-vocab
+# ✅ Load vocab (the file must be in the GitHub repository root)
+try:
+    with open("vocab.pkl", "rb") as f:
+        vocab = pickle.load(f)
+    print("vocab.pkl loaded successfully.")
+except FileNotFoundError:
+    print("Error: vocab.pkl not found. Check that it is in the project root.")
+    sys.exit(1)  # do not start the service if the file is missing

-# ✅ Tokenizer
+# ✅ Load tokenizer (uses kobert_tokenizer)
+# Kept because this is the approach used in the Colab code.
 tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
+print("Tokenizer loaded successfully.")

 # ✅ Load model
+# Define the model architecture, then load the saved state_dict into it.
+# num_classes must match the size of the category dict.
 model = BERTClassifier(
     BertModel.from_pretrained('skt/kobert-base-v1'),
-    dr_rate=0.5,
+    dr_rate=0.5,  # change this to the dr_rate value used during training
     num_classes=len(category)
 )
-
-
-
+
+# Load the textClassifierModel.pt file.
+# The weights file must be available at runtime; the Dockerfile was expected to
+# fetch it from the Hugging Face Hub.
+try:
+    # If the Dockerfile downloaded the model at build time, a local path could be used here.
+    # The current Dockerfile, however, only installs
+    # git+https://github.com/SKTBrain/KOBERT#egg=kobert_tokenizer and does not download
+    # the model file, so the Hugging Face Hub download logic is added back here.
+    from huggingface_hub import hf_hub_download
+    HF_MODEL_REPO_ID = "hiddenFront/TextClassifier"  # the user's actual Hugging Face repository ID
+    HF_MODEL_FILENAME = "textClassifierModel.pt"
+    model_path = hf_hub_download(repo_id=HF_MODEL_REPO_ID, filename=HF_MODEL_FILENAME)
+    print(f"Model file downloaded successfully to '{model_path}'.")
+
+    # Load the model's state_dict.
+    loaded_state_dict = torch.load(model_path, map_location=device)
+
+    # Adjust state_dict keys if needed: checkpoints saved from a torch.nn.DataParallel
+    # wrapper prefix every key with 'module.', which must be stripped before loading.
+    new_state_dict = collections.OrderedDict()
+    for k, v in loaded_state_dict.items():
+        name = k
+        if name.startswith('module.'):
+            name = name[7:]
+        new_state_dict[name] = v
+
+    model.load_state_dict(new_state_dict)
+    model.to(device)  # move the model to the target device
+    model.eval()  # switch to inference mode
+    print("Model loaded successfully.")
+
+except Exception as e:
+    print(f"Error: failed while downloading or loading the model: {e}")
+    sys.exit(1)  # do not start the service if model loading fails
+

 # ✅ Parameters needed for dataset creation
 max_len = 64
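For reference, the tuple unpacking in BERTClassifier.forward above can be sanity-checked in isolation. A minimal sketch, assuming the same skt/kobert-base-v1 checkpoint and max_len = 64; with return_dict=False the model returns (last_hidden_state, pooler_output, ...), and only the pooled CLS vector feeds the classifier head:

import torch
from transformers import BertModel

bert = BertModel.from_pretrained('skt/kobert-base-v1')
ids = torch.zeros((1, 64), dtype=torch.long)   # dummy batch: one sentence of max_len 64
mask = torch.ones((1, 64))                     # attend to every position
seq, pooled = bert(input_ids=ids, attention_mask=mask, return_dict=False)
print(seq.shape)     # torch.Size([1, 64, 768]): per-token hidden states
print(pooled.shape)  # torch.Size([1, 768]): CLS representation fed to the classifier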
@@ -43,20 +142,26 @@ batch_size = 32
 def predict(predict_sentence):
     data = [predict_sentence, '0']
     dataset_another = [data]
+    # Setting num_workers to 0 is recommended in a deployment environment.
     another_test = BERTDataset(dataset_another, 0, 1, tokenizer, vocab, max_len, True, False)
-    test_dataLoader =
+    test_dataLoader = DataLoader(another_test, batch_size=batch_size, num_workers=0)

-    model.eval()
-    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataLoader):
-        token_ids = token_ids.long().to(device)
-        segment_ids = segment_ids.long().to(device)
+    model.eval()  # put the model in evaluation mode before predicting

-
-
-
-
-
-
+    with torch.no_grad():  # disable gradient computation
+        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataLoader):
+            token_ids = token_ids.long().to(device)
+            segment_ids = segment_ids.long().to(device)
+
+            out = model(token_ids, valid_length, segment_ids)
+
+            logits = out
+            logits = logits.detach().cpu().numpy()
+
+            predicted_category_index = np.argmax(logits)
+            predicted_category_name = list(category.keys())[predicted_category_index]
+
+    return predicted_category_name

 # ✅ Endpoint definitions
 class InputText(BaseModel):
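One subtlety in the loop above: np.argmax is called on the whole (1, num_classes) logits array, which NumPy flattens, so the scalar index is only correct because the batch holds a single sentence. It also assumes the insertion order of category's keys matches the training label indices. A small sketch with a hypothetical category dict:

import numpy as np

category = {"IT": 0, "economy": 1, "sports": 2}  # hypothetical contents of category.pkl
logits = np.array([[0.2, 1.7, -0.4]])            # one sentence, one row of class scores
idx = np.argmax(logits)                          # flattens to shape (3,) and returns 1
print(list(category.keys())[idx])                # -> "economy"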
@@ -70,3 +175,4 @@ def root():
 async def predict_route(item: InputText):
     result = predict(item.text)
     return {"text": item.text, "classification": result}
+
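Once the Space is up, the route can be exercised as below. This is a sketch: the /predict path and port 8000 are assumptions, since the route decorator sits outside the visible hunks.

import requests

resp = requests.post("http://localhost:8000/predict",
                     json={"text": "sample sentence to classify"})
print(resp.json())  # e.g. {"text": "...", "classification": "economy"}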
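Since the comments in the first hunk note that the current Dockerfile installs kobert_tokenizer but never fetches the weights, an alternative to downloading at startup is a build-time step. A sketch, assuming a hypothetical download_model.py invoked from the Dockerfile with RUN python download_model.py; hf_hub_download caches the file, so the identical call in app.py would then hit the warm cache instead of the network:

# download_model.py (hypothetical): pre-populate the Hugging Face cache at image build time
from huggingface_hub import hf_hub_download

path = hf_hub_download(repo_id="hiddenFront/TextClassifier",
                       filename="textClassifierModel.pt")
print(f"Model cached at {path}")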