import torch
import numpy as np
from transformers import AutoModel
# Load the KoBERT-specific tokenizer (different from the stock Hugging Face tokenizer)
from kobert_tokenizer import KoBERTTokenizer

# 1. Set up the GPU/CPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. ๋ชจ๋ธ ๋ฐ ํ† ํฌ๋‚˜์ด์ € ๋กœ๋“œ (์ถ”๊ฐ€ ์ˆ˜์ •)
MODEL_NAME = "monologg/kobert"
# ํ† ํฌ๋‚˜์ด์ €๋ฅผ ๋กœ๋“œํ•  ๋•Œ 'monologg/kobert' ๋Œ€์‹  
# SKT Brain์˜ ๊ณต์‹ ์ €์žฅ์†Œ ์ด๋ฆ„์ธ 'skt/kobert-base-v1'์„ ์‚ฌ์šฉํ•˜๋Š” ๊ฒƒ์ด ๋” ์•ˆ์ •์ ์ž…๋‹ˆ๋‹ค.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1') 
model = AutoModel.from_pretrained(MODEL_NAME)

# ๋ชจ๋ธ์„ ์„ค์ •๋œ ์žฅ์น˜(GPU ๋˜๋Š” CPU)๋กœ ์ด๋™
model.to(device)

# 3. Define the embedding extraction function
def get_kobert_embedding(text):
    # Tokenize the text and convert it into model inputs
    inputs = tokenizer.batch_encode_plus(
        [text],  # the input must be wrapped in a list
        padding='max_length',
        max_length=64,  # maximum sequence length (adjust as needed)
        truncation=True,
        return_tensors="pt"  # return PyTorch tensors
    ).to(device)

    # Model inference
    with torch.no_grad():
        # outputs contains last_hidden_state (one embedding per token), etc.
        outputs = model(**inputs)

    # Extract the sentence embedding: use the [CLS] token's embedding.
    # The first token (index 0) of last_hidden_state is [CLS], which
    # represents the whole sentence.
    # shape: (1, 768)
    sentence_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()

    return sentence_embedding[0]  # return a 768-dimensional numpy array
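
# An alternative worth knowing (our addition, not part of the original
# script): instead of [CLS] pooling, a masked mean over all token embeddings
# is a common way to build sentence vectors for BERT-style encoders. This is
# a minimal sketch under that assumption; the helper name
# get_mean_pooled_embedding is ours.
def get_mean_pooled_embedding(text):
    inputs = tokenizer.batch_encode_plus(
        [text],
        padding='max_length',
        max_length=64,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Use the attention mask to zero out padding positions, then average the
    # remaining token embeddings into a single 768-dimensional vector
    mask = inputs["attention_mask"].unsqueeze(-1)            # (1, 64, 1)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)   # (1, 768)
    counts = mask.sum(dim=1).clamp(min=1)                    # real-token count
    return (summed / counts).cpu().numpy()[0]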

# 4. Run on example Danggeun Market (Karrot) review sentences
# (the inputs stay in Korean because KoBERT is a Korean-language model)
review_sentences = [
    "ํŒ๋งค์ž๋‹˜ ๋งค๋„ˆ๊ฐ€ ๋„ˆ๋ฌด ์ข‹์•„์„œ ๊ธฐ๋ถ„ ์ข‹์€ ๊ฑฐ๋ž˜์˜€์Šต๋‹ˆ๋‹ค.",  # "The seller was so polite that it was a pleasant transaction."
    "๋ฌผ๊ฑด ์ƒํƒœ๊ฐ€ ์ƒ๊ฐ๋ณด๋‹ค ๋ณ„๋กœ์—ฌ์„œ ์•„์‰ฝ๋„ค์š”. ๋‹ค์Œ์—” ๊ฑฐ๋ž˜ ์•ˆ ํ•  ๊ฒƒ ๊ฐ™์•„์š”.",  # "The item was in worse condition than expected; I probably won't buy from them again."
    "์ด ์ž์ „๊ฑฐ ๋ชจ๋ธ์€ ์ค‘๊ณ  ์‹œ์„ธ๊ฐ€ ์–ด๋А ์ •๋„์ผ๊นŒ์š”?",  # "What is the going secondhand price for this bicycle model?"
]

print("\n--- KoBERT ์ž„๋ฒ ๋”ฉ ์ถ”์ถœ ๊ฒฐ๊ณผ ---")
for sentence in review_sentences:
    embedding = get_kobert_embedding(sentence)
    
    print(f"๋ฌธ์žฅ: '{sentence}'")
    print(f"  -> ์ž„๋ฒ ๋”ฉ ์ฐจ์›: {embedding.shape}") # 768์ฐจ์›
    print(f"  -> ์ž„๋ฒ ๋”ฉ ๋ฒกํ„ฐ ์ผ๋ถ€ (์ฒซ 5๊ฐœ): {embedding[:5].round(4)}")
    print("-" * 30)