import torch
import torch.nn.functional as F
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util

# Device setup (prefer GPU, fall back to CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Running on: {device}")

# =============================================================================
# 2. Load the models (this may take a while)
# =============================================================================
print("\nโณ [1/3] KoBART ์š”์•ฝ ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
kobart_summarizer = pipeline(
    "summarization",
    model="gogamza/kobart-summarization",
    device=0 if torch.cuda.is_available() else -1
)

print("โณ [2/3] SBERT ์œ ์‚ฌ๋„ ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
sbert_model = SentenceTransformer('jhgan/ko-sroberta-multitask')

print("โณ [3/3] NLI(๋ชจ์ˆœ ํƒ์ง€) ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
nli_model_name = "Huffon/klue-roberta-base-nli"
nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)
nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name).to(device)

print("๐ŸŽ‰ ๋ชจ๋“  ๋ชจ๋ธ ๋กœ๋“œ ์™„๋ฃŒ!\n")

# =============================================================================
# 3. Helper function definitions (Worker Functions)
# =============================================================================

def summarize_kobart_strict(text):
    """Summarize the article body with KoBART."""
    # Skip summarization for very short bodies (prevents model errors)
    if len(text) < 50:
        return text

    try:
        result = kobart_summarizer(
            text,
            min_length=15,
            max_length=128,
            num_beams=4,
            no_repeat_ngram_size=3,
            early_stopping=True
        )[0]['summary_text']
        return result.strip()
    except Exception:
        return text[:100]  # On failure, fall back to the first 100 characters

def get_cosine_similarity(title, summary):
    """Compute the cosine similarity between the title and the summary with SBERT."""
    emb1 = sbert_model.encode(title, convert_to_tensor=True)
    emb2 = sbert_model.encode(summary, convert_to_tensor=True)
    return util.cos_sim(emb1, emb2).item()

def get_mismatch_score(summary, title):
    """Compute the contradiction probability between the summary (premise) and the title (hypothesis) with the NLI model."""
    inputs = nli_tokenizer(
        summary, title,
        return_tensors='pt',
        truncation=True,
        max_length=512
    ).to(device)

    # RoBERTa does not use token_type_ids; drop them to avoid a forward() error
    if "token_type_ids" in inputs:
        del inputs["token_type_ids"]

    with torch.no_grad():
        outputs = nli_model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)[0]

    # Label order for Huffon/klue-roberta-base-nli: [Entailment, Neutral, Contradiction]
    # Return the contradiction probability (index 2)
    return round(probs[2].item(), 4)
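
# For interpretation (hypothetical numbers): probs = [0.05, 0.15, 0.80] would mean
# the NLI model assigns an 80% probability that the summary contradicts the title,
# so get_mismatch_score would return 0.8.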

# =============================================================================
# 4. Final main function (Main Logic)
# =============================================================================

def calculate_mismatch_score(article_title, article_body):
    """
    Apply the optimal weights found via grid search:
    - w1 (SBERT, semantic distance): 0.8
    - w2 (NLI, logical contradiction): 0.2
    - Threshold: a score of 0.45 or higher is flagged as 'risky'
    """
    # 1. Summarize the body
    summary = summarize_kobart_strict(article_body)
    
    # 2. SBERT semantic distance (1 - similarity)
    sbert_sim = get_cosine_similarity(article_title, summary)
    semantic_distance = 1 - sbert_sim

    # 3. NLI contradiction probability
    nli_contradiction = get_mismatch_score(summary, article_title)

    # 4. Final weighted score
    w1, w2 = 0.8, 0.2
    final_score = (w1 * semantic_distance) + (w2 * nli_contradiction)
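    # Illustrative arithmetic (hypothetical values): semantic_distance = 0.50 and
    # nli_contradiction = 0.30 give 0.8 * 0.50 + 0.2 * 0.30 = 0.46, just above
    # the 0.45 threshold, so that title/body pair would be flagged as risky.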
    reason = (
        f"[Debug mode]\n"
        f"1. Summary: {summary}\n"
        f"2. SBERT distance: {semantic_distance:.4f}\n"
        f"3. NLI contradiction: {nli_contradiction:.4f}"
    )
    
    # 5. Verdict (threshold 0.45)
    if final_score >= 0.45:
        recommendation = "The title is likely to distort or contradict the body content."
    else:
        recommendation = "The title and body are consistent."

    # Data returned to main.py
    return {
        "score": round(final_score, 4),
        "reason": reason,
        "recommendation": recommendation
    }
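

# Minimal usage sketch. The title/body pair below is a hypothetical, deliberately
# mismatched example (written in Korean, since all three models are Korean-language);
# in the actual service this function is called from main.py with real article data.
if __name__ == "__main__":
    sample_title = "주식 시장, 사상 최고치 경신하며 급등"  # "Stocks surge to a record high"
    sample_body = (  # body says the opposite: three straight days of declines
        "코스피 지수는 투자 심리 위축 속에 3거래일 연속 하락했다. "
        "전문가들은 기업 실적 부진이 이어질 경우 추가 하락 가능성이 있다고 경고했다. "
        "시장 전반의 거래량 역시 전주 대비 크게 줄어든 모습을 보였다."
    )
    result = calculate_mismatch_score(sample_title, sample_body)
    print(result["score"], "-", result["recommendation"])
    print(result["reason"])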