# project-tdm/mismatch_model.py
import torch
import torch.nn.functional as F
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
# Device setup (prefer GPU, fall back to CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"โœ… ํ˜„์žฌ ์‹คํ–‰ ํ™˜๊ฒฝ: {device}")
# =============================================================================
# 2. Load models (this may take a while)
# =============================================================================
print("\nโณ [1/3] KoBART ์š”์•ฝ ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
kobart_summarizer = pipeline(
    "summarization",
    model="gogamza/kobart-summarization",
    device=0 if torch.cuda.is_available() else -1
)
print("โณ [2/3] SBERT ์œ ์‚ฌ๋„ ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
sbert_model = SentenceTransformer('jhgan/ko-sroberta-multitask')
print("โณ [3/3] NLI(๋ชจ์ˆœ ํƒ์ง€) ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
nli_model_name = "Huffon/klue-roberta-base-nli"
nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)
nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name).to(device)
print("๐ŸŽ‰ ๋ชจ๋“  ๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ!\n")
# =============================================================================
# 3. Helper functions (worker functions)
# =============================================================================
def summarize_kobart_strict(text):
    """Summarize the article body with KoBART."""
    # Skip summarization for very short bodies (avoids model errors)
    if len(text) < 50:
        return text
    try:
        result = kobart_summarizer(
            text,
            min_length=15,
            max_length=128,
            num_beams=4,
            no_repeat_ngram_size=3,
            early_stopping=True
        )[0]['summary_text']
        return result.strip()
    except Exception:
        return text[:100]  # On failure, fall back to the first 100 characters
def get_cosine_similarity(title, summary):
    """Compute the cosine similarity between the title and the summary with SBERT."""
    emb1 = sbert_model.encode(title, convert_to_tensor=True)
    emb2 = sbert_model.encode(summary, convert_to_tensor=True)
    return util.cos_sim(emb1, emb2).item()
def get_mismatch_score(summary, title):
    """Compute the contradiction probability between the summary (premise) and the title (hypothesis) with the NLI model."""
    inputs = nli_tokenizer(
        summary, title,
        return_tensors='pt',
        truncation=True,
        max_length=512
    ).to(device)
    # Guard against RoBERTa errors (drop token_type_ids if present)
    if "token_type_ids" in inputs:
        del inputs["token_type_ids"]
    with torch.no_grad():
        outputs = nli_model(**inputs)
    probs = F.softmax(outputs.logits, dim=-1)[0]
    # Label order for Huffon/klue-roberta-base-nli: [Entailment, Neutral, Contradiction]
    # Return the contradiction probability (index 2)
    return round(probs[2].item(), 4)
# =============================================================================
# 4. Final main function (main logic)
# =============================================================================
def calculate_mismatch_score(article_title, article_body):
    """
    Applies the optimal weights found via grid search:
    - w1 (SBERT, semantic distance): 0.8
    - w2 (NLI, logical contradiction): 0.2
    - Threshold: a score of 0.45 or higher is flagged as 'risky'
    """
    # if not (kobart_summarizer and sbert_model and nli_model):
    #     return {"score": 0.0, "reason": "Model loading failed", "recommendation": "Check the server"}
    # 1. Summarize the body
    summary = summarize_kobart_strict(article_body)
    # 2. SBERT semantic distance (1 - similarity)
    sbert_sim = get_cosine_similarity(article_title, summary)
    semantic_distance = 1 - sbert_sim
    # 3. NLI contradiction probability
    nli_contradiction = get_mismatch_score(summary, article_title)
    # 4. Compute the final score
    w1, w2 = 0.8, 0.2
    final_score = (w1 * semantic_distance) + (w2 * nli_contradiction)
    reason = (
        f"[Debug mode]\n"
        f"1. Summary: {summary}\n"
        f"2. SBERT distance: {semantic_distance:.4f}\n"
        f"3. NLI contradiction: {nli_contradiction:.4f}"
    )
    # reason = f"Reflects the semantic distance ({semantic_distance:.4f}) between title and body and the contradiction probability ({nli_contradiction:.4f})."
    # 5. Decide the verdict (threshold 0.45)
    if final_score >= 0.45:
        recommendation = "The title is likely to distort or contradict the body."
    else:
        recommendation = "The title is consistent with the body."
    # Payload returned to main.py
    return {
        "score": round(final_score, 4),
        "reason": reason,
        "recommendation": recommendation
    }
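# -----------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the served API): the sample
# title and body below are made-up placeholders. Real inputs should be Korean
# news text, since all three models above are Korean-language models.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    sample_title = "placeholder news title"            # hypothetical example input
    sample_body = "placeholder article body text " * 5  # hypothetical example input
    result = calculate_mismatch_score(sample_title, sample_body)
    print(result["score"])
    print(result["reason"])
    print(result["recommendation"])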