File size: 1,785 Bytes
9368f67
 
 
 
 
 
 
 
 
 
 
 
 
 
6755335
 
 
9368f67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
"""
FILE: 06_reranker.py

PURPOSE:
- Improve ranking accuracy by comparing query + result pairs using a CrossEncoder
- Works on top FAISS candidates and reorders them based on semantic relevance

REQUIREMENTS:
pip install sentence-transformers
"""

from sentence_transformers import CrossEncoder

# Model identifier handed to CrossEncoder when a Reranker is constructed.
# NOTE(review): BAAI/bge-reranker-base was selected as an accuracy upgrade
# over the MS-MARCO MiniLM cross-encoder; that claim comes from the original
# author and is not benchmarked anywhere in this file -- verify before relying
# on it.
RERANK_MODEL = "BAAI/bge-reranker-base"

class Reranker:
    """Re-score retrieval candidates against a query with a CrossEncoder."""

    def __init__(self):
        """Load the cross-encoder named by the module-level ``RERANK_MODEL``."""
        print(f"🤖 Loading reranking model: {RERANK_MODEL} (Max Accuracy Mode)")
        # max_length=512 is forwarded to CrossEncoder for each (query, text) pair.
        self.model = CrossEncoder(RERANK_MODEL, max_length=512)

    @staticmethod
    def _field(candidate, key):
        """Return ``candidate[key]`` as text, or ``""`` if missing or None."""
        value = candidate.get(key)
        return "" if value is None else str(value)

    @classmethod
    def _format_candidate(cls, candidate):
        """Build the text scored against the query.

        Layout: ``"<name> (<category> in <region>): <text>"`` with every
        bullet character (``•``) in ``text`` replaced by ``", "``.
        """
        description = cls._field(candidate, "text").replace("•", ", ")
        name = cls._field(candidate, "name")
        category = cls._field(candidate, "category")
        region = cls._field(candidate, "region")
        return f"{name} ({category} in {region}): {description}"

    def rerank(self, query, candidates):
        """Score each candidate against ``query`` and return them best-first.

        Args:
            query: The search query; paired with each candidate's text.
            candidates: List of dicts. The keys ``"name"``, ``"category"``,
                ``"region"`` and ``"text"`` are read to build the text that
                is scored; a missing or ``None`` value is treated as an empty
                string so one incomplete record does not abort the batch.

        Returns:
            A new list holding the same dict objects, sorted by descending
            ``"rerank_score"``. Each dict is mutated in place to carry that
            key as a ``float``. An empty/falsy ``candidates`` yields ``[]``
            without calling the model.
        """
        if not candidates:
            return []

        pairs = [(query, self._format_candidate(c)) for c in candidates]

        # One predict call for the whole batch; scores come back in pair order.
        scores = self.model.predict(pairs)

        for candidate, score in zip(candidates, scores):
            candidate["rerank_score"] = float(score)

        return sorted(candidates, key=lambda x: x["rerank_score"], reverse=True)