# kobert_test.py — KoBERT sentence-embedding demo (from the python_roberta_hf repo)
# Snapshot: commit 899f482, author WildOjisan
import torch
import numpy as np
from transformers import AutoModel
# Load the KoBERT-specific tokenizer (differs from the generic Hugging Face tokenizer)
from kobert_tokenizer import KoBERTTokenizer
# 1. Pick the compute device: GPU when available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"์‚ฌ์šฉ ์žฅ์น˜: {device}")

# 2. Load the model and tokenizer.
MODEL_NAME = "monologg/kobert"

# NOTE: the tokenizer is pulled from SKT Brain's official repository
# ('skt/kobert-base-v1') rather than 'monologg/kobert', which is more stable.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
model = AutoModel.from_pretrained(MODEL_NAME)

# Place the model on the selected device (GPU or CPU).
model.to(device)
# 3. Define the embedding-extraction function
def get_kobert_embedding(text, max_length=64):
    """Return the KoBERT sentence embedding for *text*.

    The sentence is represented by the final-layer hidden state of the
    [CLS] token (index 0), which summarizes the whole input.

    Args:
        text: Input sentence to embed.
        max_length: Maximum token length; longer inputs are truncated and
            shorter ones padded. Default 64, matching the original script.

    Returns:
        A 1-D numpy array (hidden size 768 per the model card) holding the
        [CLS] embedding, moved to the CPU.
    """
    # Tokenize and convert to model inputs (a batch of one sentence).
    inputs = tokenizer.batch_encode_plus(
        [text],
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors="pt",  # return PyTorch tensors
    ).to(device)

    # Inference only — disable gradient tracking.
    with torch.no_grad():
        # outputs.last_hidden_state holds per-token embeddings.
        outputs = model(**inputs)

    # Use the [CLS] token (first position) as the sentence representation.
    # shape: (1, 768) before indexing below.
    sentence_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    return sentence_embedding[0]  # (768,) numpy array
# 4. Run the demo on example Danggeun Market (secondhand marketplace) reviews.
review_sentences = [
    "ํŒ๋งค์ž๋‹˜ ๋งค๋„ˆ๊ฐ€ ๋„ˆ๋ฌด ์ข‹์•„์„œ ๊ธฐ๋ถ„ ์ข‹์€ ๊ฑฐ๋ž˜์˜€์Šต๋‹ˆ๋‹ค.",
    "๋ฌผ๊ฑด ์ƒํƒœ๊ฐ€ ์ƒ๊ฐ๋ณด๋‹ค ๋ณ„๋กœ์—ฌ์„œ ์•„์‰ฝ๋„ค์š”. ๋‹ค์Œ์—” ๊ฑฐ๋ž˜ ์•ˆ ํ•  ๊ฒƒ ๊ฐ™์•„์š”.",
    "์ด ์ž์ „๊ฑฐ ๋ชจ๋ธ์€ ์ค‘๊ณ  ์‹œ์„ธ๊ฐ€ ์–ด๋А ์ •๋„์ผ๊นŒ์š”?",
]

print("\n--- KoBERT ์ž„๋ฒ ๋”ฉ ์ถ”์ถœ ๊ฒฐ๊ณผ ---")
for review in review_sentences:
    vec = get_kobert_embedding(review)
    print(f"๋ฌธ์žฅ: '{review}'")
    print(f" -> ์ž„๋ฒ ๋”ฉ ์ฐจ์›: {vec.shape}")  # expected 768-dimensional
    print(f" -> ์ž„๋ฒ ๋”ฉ ๋ฒกํ„ฐ ์ผ๋ถ€ (์ฒซ 5๊ฐœ): {vec[:5].round(4)}")
    print("-" * 30)