# Source listing metadata (viewer residue, not part of the module): file size 1,785 bytes; commits 9368f67, 6755335, 9368f67.
"""
FILE: 06_reranker.py

PURPOSE:
    - Improve ranking accuracy by scoring (query, result) pairs with a CrossEncoder.
    - Takes the top FAISS candidates and reorders them by semantic relevance.

REQUIREMENTS:
    pip install sentence-transformers
"""
from sentence_transformers import CrossEncoder
# Model identifier passed to CrossEncoder when a Reranker is constructed.
# NOTE(review): the original author chose BAAI/bge-reranker-base as an upgrade
# and states that it significantly outperforms the MS-MARCO MiniLM
# cross-encoder; nothing in this file demonstrates that — confirm against
# published benchmarks before relying on the claim.
RERANK_MODEL = "BAAI/bge-reranker-base"
class Reranker:
    """Reorder retrieval candidates by cross-encoder relevance to a query.

    The wrapped model scores ``(query, candidate_text)`` pairs; candidates are
    then returned sorted by that score, highest first.
    """

    def __init__(self, model_name=None, max_length=512):
        """Load the cross-encoder model.

        Args:
            model_name: Identifier handed to ``CrossEncoder``. ``None`` (the
                default) selects the module-level ``RERANK_MODEL``.
            max_length: Forwarded to ``CrossEncoder`` as ``max_length``. The
                default of 512 is the value the original code hard-coded.
        """
        # Resolve the default at call time rather than in the signature so the
        # class definition itself does not depend on module globals.
        if model_name is None:
            model_name = RERANK_MODEL
        print(f"🤖 Loading reranking model: {model_name} (Max Accuracy Mode)")
        self.model = CrossEncoder(model_name, max_length=max_length)

    @staticmethod
    def _format_candidate(candidate):
        """Render one candidate as ``"name (category in region): text"``.

        Missing or ``None`` fields become empty strings instead of raising,
        and bullet separators in the text are replaced with commas.
        """

        def field(key):
            value = candidate.get(key)
            return "" if value is None else str(value)

        description = field("text").replace("•", ", ")
        return (
            f"{field('name')} ({field('category')} in {field('region')}): "
            f"{description}"
        )

    def rerank(self, query, candidates):
        """Score candidates against ``query`` and return them best-first.

        Args:
            query: The search query paired with every candidate.
            candidates: Iterable of dicts shaped like
                ``{"name": "", "domain": "", "category": "", "region": "",
                "text": "...", "score": number}``. Only ``name``,
                ``category``, ``region`` and ``text`` are read; absent or
                ``None`` values are treated as empty strings.

        Returns:
            A new list containing the same dict objects, sorted by descending
            ``rerank_score``. Each dict is updated in place with a float
            ``rerank_score`` key. An empty or ``None`` input yields ``[]``.
        """
        if not candidates:
            return []

        # Materialise once so generators and other non-indexable iterables
        # are accepted, and so the model is never called with zero pairs.
        items = list(candidates)
        if not items:
            return []

        pairs = [(query, self._format_candidate(item)) for item in items]
        scores = self.model.predict(pairs)

        # float() turns whatever scalar type the model returns into a plain
        # Python float.
        for item, score in zip(items, scores):
            item["rerank_score"] = float(score)

        return sorted(items, key=lambda item: item["rerank_score"], reverse=True)