"""
FILE: 06_reranker.py

PURPOSE:
- Improve ranking accuracy by comparing query + result pairs using a
  CrossEncoder.
- Works on the top FAISS candidates and reorders them based on semantic
  relevance.

REQUIREMENTS:
    pip install sentence-transformers
"""

from sentence_transformers import CrossEncoder

# Default cross-encoder checkpoint used for semantic relevance matching.
# UPGRADE: switched to BAAI/bge-reranker-base, chosen by the original author
# for higher accuracy than the MS-MARCO MiniLM reranker.
RERANK_MODEL = "BAAI/bge-reranker-base"


class Reranker:
    """Re-order retrieval candidates by cross-encoder relevance to a query."""

    def __init__(self, model_name: str = RERANK_MODEL, max_length: int = 512) -> None:
        """Load the cross-encoder used for scoring.

        Args:
            model_name: Checkpoint name or path handed to ``CrossEncoder``.
                Defaults to ``RERANK_MODEL``.
            max_length: Maximum sequence length handed to ``CrossEncoder``.
                Defaults to 512, the value the original code used for
                "max accuracy" mode.
        """
        print(f"🤖 Loading reranking model: {model_name} (Max Accuracy Mode)")
        self.model = CrossEncoder(model_name, max_length=max_length)

    @staticmethod
    def _field(candidate, key) -> str:
        """Return ``candidate[key]`` as text, or ``""`` if missing or None."""
        value = candidate.get(key)
        return "" if value is None else str(value)

    @classmethod
    def _format_candidate(cls, candidate) -> str:
        """Build the ``Name (Category in Region): Description`` model input."""
        # Bullet separators are replaced with commas so the description reads
        # as plain prose for the model.
        description = cls._field(candidate, "text").replace("•", ", ")
        return (
            f"{cls._field(candidate, 'name')} "
            f"({cls._field(candidate, 'category')} in "
            f"{cls._field(candidate, 'region')}): {description}"
        )

    def rerank(self, query: str, candidates) -> list:
        """Score every candidate against ``query`` and sort best-first.

        Args:
            query: The search query text.
            candidates: Iterable of dicts shaped like::

                    {"name": "", "domain": "", "category": "", "region": "",
                     "text": "...", "score": number}

                Only ``name``, ``category``, ``region`` and ``text`` are read
                here; a missing or ``None`` field is treated as an empty
                string instead of raising.

        Returns:
            A new list holding the same dict objects, each annotated in place
            with a float ``"rerank_score"`` and ordered by that score,
            highest first. Ties keep their incoming order. An empty or
            ``None`` input yields ``[]``.
        """
        # Materialise once so generators and other one-shot iterables can be
        # both paired with the query and annotated afterwards.
        items = list(candidates) if candidates else []
        if not items:
            return []

        # One (query, formatted candidate) pair per item, in input order.
        pairs = [(query, self._format_candidate(item)) for item in items]

        # The model returns one score per pair, aligned with ``pairs``.
        scores = self.model.predict(pairs)

        for item, score in zip(items, scores):
            item["rerank_score"] = float(score)

        return sorted(items, key=lambda item: item["rerank_score"], reverse=True)