hiddenFront committed on
Commit
1efa28d
·
verified ·
1 Parent(s): e2b01db

Update app.py

Files changed (1)
  1. app.py +39 -82
app.py CHANGED
@@ -5,68 +5,19 @@ import pickle
  import gluonnlp as nlp
  import numpy as np
  import os
- import sys  # add the sys import (fixes a NameError)
 
- # Use transformers.AutoTokenizer instead of KoBERTTokenizer
- from transformers import BertModel, AutoTokenizer  # keep the AutoTokenizer import
  from torch.utils.data import Dataset, DataLoader
  import logging  # keep the logging import
- from huggingface_hub import hf_hub_download  # add the hf_hub_download import
- import collections  # keep the collections import
-
- # --- 1. BERTClassifier model class definition (moved from model.py) ---
- class BERTClassifier(torch.nn.Module):
-     def __init__(self,
-                  bert,
-                  hidden_size=768,
-                  num_classes=5,  # number of classes (must match the size of the category dictionary)
-                  dr_rate=None,
-                  params=None):
-         super(BERTClassifier, self).__init__()
-         self.bert = bert
-         self.dr_rate = dr_rate
-
-         self.classifier = torch.nn.Linear(hidden_size, num_classes)
-         if dr_rate:
-             self.dropout = torch.nn.Dropout(p=dr_rate)
-
-     def gen_attention_mask(self, token_ids, valid_length):
-         attention_mask = torch.zeros_like(token_ids)
-         for i, v in enumerate(valid_length):
-             attention_mask[i][:v] = 1
-         return attention_mask.float()
-
-     def forward(self, token_ids, valid_length, segment_ids):
-         attention_mask = self.gen_attention_mask(token_ids, valid_length)
-
-         _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(), attention_mask=attention_mask.float().to(token_ids.device), return_dict=False)
-
-         if self.dr_rate:
-             out = self.dropout(pooler)
-         else:
-             out = pooler
-         return self.classifier(out)
 
- # --- 2. BERTDataset class definition (moved from dataset.py) ---
- class BERTDataset(Dataset):
-     def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len, pad, pair):
-         # nlp.data.BERTSentenceTransform takes a tokenizer function.
-         # Pass AutoTokenizer's tokenize method directly.
-         transform = nlp.data.BERTSentenceTransform(
-             bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair
-         )
-         self.sentences = [transform([i[sent_idx]]) for i in dataset]
-         self.labels = [np.int32(i[label_idx]) for i in dataset]
-
-     def __getitem__(self, i):
-         return (self.sentences[i] + (self.labels[i],))
-
-     def __len__(self):
-         return len(self.labels)
-
- # --- 3. FastAPI app and global variable setup ---
  app = FastAPI()
- device = torch.device("cpu")  # Render's free tier mostly runs on CPU.
 
  # ✅ Load category (must be in the GitHub repository root)
  try:
@@ -87,38 +38,24 @@ except FileNotFoundError:
      sys.exit(1)  # do not start the service if the file is missing
 
  # ✅ Load the tokenizer (using transformers.AutoTokenizer)
- # Load the KoBERT model's tokenizer via AutoTokenizer instead of KoBERTTokenizer.
- # This avoids the XLNetTokenizer warning and the kobert_tokenizer install issue.
  tokenizer = AutoTokenizer.from_pretrained('skt/kobert-base-v1')
  print("Tokenizer loaded successfully.")
 
- # ✅ Load the model
- # num_classes must match the size of the category dictionary.
- bertmodel = BertModel.from_pretrained('skt/kobert-base-v1')
- model = BERTClassifier(
-     bertmodel,
-     dr_rate=0.5,  # change to the dr_rate used during training
-     num_classes=len(category)
- )
-
- # Load the textClassifierModel.pt file
  try:
      HF_MODEL_REPO_ID = "hiddenFront/TextClassifier"  # your actual Hugging Face repository ID
-     HF_MODEL_FILENAME = "textClassifierModel.pt"
      model_path = hf_hub_download(repo_id=HF_MODEL_REPO_ID, filename=HF_MODEL_FILENAME)
      print(f"Model file downloaded successfully to '{model_path}'.")
 
-     loaded_state_dict = torch.load(model_path, map_location=device)
-
-     new_state_dict = collections.OrderedDict()
-     for k, v in loaded_state_dict.items():
-         name = k
-         if name.startswith('module.'):
-             name = name[7:]
-         new_state_dict[name] = v
-
-     model.load_state_dict(new_state_dict)
-     model.to(device)  # move the model to the device
      model.eval()  # set inference mode
      print("Model loaded successfully.")
 
@@ -127,6 +64,25 @@ except Exception as e:
      sys.exit(1)  # do not start the service if model loading fails
 
 
  # ✅ Parameters needed to build the dataset
  max_len = 64
  batch_size = 32
@@ -136,7 +92,8 @@ def predict(predict_sentence):
      data = [predict_sentence, '0']
      dataset_another = [data]
      # recommend num_workers=0 in deployment environments
-     another_test = BERTDataset(dataset_another, 0, 1, tokenizer, vocab, max_len, True, False)  # pass the tokenizer object directly
      test_dataLoader = DataLoader(another_test, batch_size=batch_size, num_workers=0)
 
      model.eval()  # set the model to eval mode for prediction
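Note on the deleted loading path: the removed lines rebuilt BERTClassifier and then loaded a state_dict, stripping the 'module.' prefix that torch.nn.DataParallel prepends to parameter names when a model is saved while wrapped for parallel training. A minimal, self-contained sketch of that pattern, assuming a generic torch.nn.Module and a hypothetical checkpoint path (names here are placeholders, not from this repo):

import collections
import torch

def load_stripped_state_dict(model: torch.nn.Module, path: str) -> torch.nn.Module:
    # DataParallel checkpoints store parameters as 'module.<name>';
    # strip the prefix so the weights load into a plain (non-parallel) module.
    state_dict = torch.load(path, map_location="cpu")
    cleaned = collections.OrderedDict(
        (k[len("module."):] if k.startswith("module.") else k, v)
        for k, v in state_dict.items()
    )
    model.load_state_dict(cleaned)
    return model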
 
  import gluonnlp as nlp
  import numpy as np
  import os
+ import sys  # import sys so the service can exit on startup errors
 
+ # Only AutoTokenizer from transformers is used.
+ from transformers import AutoTokenizer  # BertModel, BertForSequenceClassification, etc. are no longer needed directly.
  from torch.utils.data import Dataset, DataLoader
  import logging  # keep the logging import
+ from huggingface_hub import hf_hub_download  # keep the hf_hub_download import
+ # The collections module may no longer be needed, but keep it just in case.
+ import collections
 
+ # --- 1. FastAPI app and global variable setup ---
  app = FastAPI()
+ device = torch.device("cpu")  # Hugging Face Spaces' free tier mostly runs on CPU.
 
  # ✅ Load category (must be in the GitHub repository root)
  try:
 
      sys.exit(1)  # do not start the service if the file is missing
 
  # ✅ Load the tokenizer (using transformers.AutoTokenizer)
  tokenizer = AutoTokenizer.from_pretrained('skt/kobert-base-v1')
  print("Tokenizer loaded successfully.")
 
+ # ✅ Load the model (downloaded from the Hugging Face Hub)
+ # Assume textClassifierModel.pt is already a slimmed-down 'complete model object' and load it directly.
  try:
      HF_MODEL_REPO_ID = "hiddenFront/TextClassifier"  # your actual Hugging Face repository ID
+     HF_MODEL_FILENAME = "textClassifierModel.pt"  # must match the filename uploaded to the Hugging Face Hub
+
      model_path = hf_hub_download(repo_id=HF_MODEL_REPO_ID, filename=HF_MODEL_FILENAME)
      print(f"Model file downloaded successfully to '{model_path}'.")
 
+     # --- Key change ---
+     # Load the slimmed-down model object directly.
+     # The file is already a full PyTorch model object (including the quantized model), so it can be loaded and used as-is.
+     model = torch.load(model_path, map_location=device)
+     # --- End of key change ---
+
      model.eval()  # set inference mode
      print("Model loaded successfully.")
 
 
      sys.exit(1)  # do not start the service if model loading fails
 
 
+ # --- 2. BERTDataset class definition (moved from dataset.py) ---
+ # This class converts raw data into the model's input format.
+ class BERTDataset(Dataset):
+     def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len, pad, pair):
+         # nlp.data.BERTSentenceTransform takes a tokenizer function.
+         # Pass AutoTokenizer's tokenize method directly.
+         transform = nlp.data.BERTSentenceTransform(
+             bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair
+         )
+         self.sentences = [transform([i[sent_idx]]) for i in dataset]
+         self.labels = [np.int32(i[label_idx]) for i in dataset]
+
+     def __getitem__(self, i):
+         return (self.sentences[i] + (self.labels[i],))
+
+     def __len__(self):
+         return len(self.labels)
+
+
  # ✅ Parameters needed to build the dataset
  max_len = 64
  batch_size = 32
 
      data = [predict_sentence, '0']
      dataset_another = [data]
      # recommend num_workers=0 in deployment environments
+     # Pass tokenizer.tokenize to BERTDataset.
+     another_test = BERTDataset(dataset_another, 0, 1, tokenizer.tokenize, vocab, max_len, True, False)
      test_dataLoader = DataLoader(another_test, batch_size=batch_size, num_workers=0)
 
      model.eval()  # set the model to eval mode for prediction
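The key change in this commit replaces the rebuild-and-load_state_dict flow with torch.load on a checkpoint that pickles the entire model object. A hedged sketch of that pattern: unpickling a full model requires its class definition to be importable at load time, and on PyTorch 2.6+ (an assumption about the runtime, not stated in the diff) torch.load defaults to weights_only=True, so full-object checkpoints need weights_only=False:

import torch

device = torch.device("cpu")

# torch.load on a fully pickled model returns the model object itself
# (quantized modules included); no load_state_dict step is needed.
# weights_only=False is required for full-object checkpoints on PyTorch >= 2.6.
model = torch.load("textClassifierModel.pt", map_location=device, weights_only=False)
model.eval()  # inference mode: disables dropout, fixes batch-norm statistics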