# kobert_test.py — KoBERT sentence-embedding demo (from the python_roberta_hf repo)
# Snapshot: commit 899f482, author WildOjisan
import torch
import numpy as np
from transformers import AutoModel
# Load the KoBERT-specific tokenizer (differs from the generic Hugging Face tokenizer)
from kobert_tokenizer import KoBERTTokenizer
# 1. Pick the compute device: GPU when available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"์‚ฌ์šฉ ์žฅ์น˜: {device}")

# 2. Load the model and tokenizer.
MODEL_NAME = "monologg/kobert"

# NOTE: the tokenizer is pulled from SKT Brain's official repository
# ('skt/kobert-base-v1') rather than 'monologg/kobert', which is more stable.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
model = AutoModel.from_pretrained(MODEL_NAME)

# Place the model on the selected device (GPU or CPU).
model.to(device)
# 3. Define the embedding-extraction function
def get_kobert_embedding(text, max_length=64):
    """Return the KoBERT sentence embedding for *text*.

    The sentence is represented by the final-layer hidden state of the
    [CLS] token (index 0), which summarizes the whole input.

    Args:
        text: Input sentence to embed.
        max_length: Maximum token length; longer inputs are truncated and
            shorter ones padded. Default 64, matching the original script.

    Returns:
        A 1-D numpy array (hidden size 768 per the model card) holding the
        [CLS] embedding, moved to the CPU.
    """
    # Tokenize and convert to model inputs (a batch of one sentence).
    inputs = tokenizer.batch_encode_plus(
        [text],
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors="pt",  # return PyTorch tensors
    ).to(device)

    # Inference only — disable gradient tracking.
    with torch.no_grad():
        # outputs.last_hidden_state holds per-token embeddings.
        outputs = model(**inputs)

    # Use the [CLS] token (first position) as the sentence representation.
    # shape: (1, 768) before indexing below.
    sentence_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    return sentence_embedding[0]  # (768,) numpy array
# 4. Run the demo on example Danggeun Market (secondhand marketplace) reviews.
review_sentences = [
    "ํŒ๋งค์ž๋‹˜ ๋งค๋„ˆ๊ฐ€ ๋„ˆ๋ฌด ์ข‹์•„์„œ ๊ธฐ๋ถ„ ์ข‹์€ ๊ฑฐ๋ž˜์˜€์Šต๋‹ˆ๋‹ค.",
    "๋ฌผ๊ฑด ์ƒํƒœ๊ฐ€ ์ƒ๊ฐ๋ณด๋‹ค ๋ณ„๋กœ์—ฌ์„œ ์•„์‰ฝ๋„ค์š”. ๋‹ค์Œ์—” ๊ฑฐ๋ž˜ ์•ˆ ํ•  ๊ฒƒ ๊ฐ™์•„์š”.",
    "์ด ์ž์ „๊ฑฐ ๋ชจ๋ธ์€ ์ค‘๊ณ  ์‹œ์„ธ๊ฐ€ ์–ด๋А ์ •๋„์ผ๊นŒ์š”?",
]

print("\n--- KoBERT ์ž„๋ฒ ๋”ฉ ์ถ”์ถœ ๊ฒฐ๊ณผ ---")
for review in review_sentences:
    vec = get_kobert_embedding(review)
    print(f"๋ฌธ์žฅ: '{review}'")
    print(f" -> ์ž„๋ฒ ๋”ฉ ์ฐจ์›: {vec.shape}")  # expected 768-dimensional
    print(f" -> ์ž„๋ฒ ๋”ฉ ๋ฒกํ„ฐ ์ผ๋ถ€ (์ฒซ 5๊ฐœ): {vec[:5].round(4)}")
    print("-" * 30)