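"""Build a FAISS similarity index for the salahkar_enhanced.csv dataset.

Embeds each row's search text with a multilingual sentence-transformer model,
stores the vectors in an inner-product FAISS index, and writes a row-aligned
metadata mapping CSV alongside the saved index file.
"""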
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
DATA_FILE = "salahkar_enhanced.csv"
OUTPUT_INDEX = "faiss_index.bin"
OUTPUT_MAPPING = "index_mapping.csv"

print("πŸ“Œ Loading dataset...")
df = pd.read_csv(DATA_FILE)

print("πŸ“Œ Loading embedding model...")
model = SentenceTransformer(MODEL_NAME)

print("πŸ“Œ Generating embeddings...")
if "search_embedding_text" not in df.columns:
    print("⚠ 'search_embedding_text' column missing. Using 'name' instead.")
    texts = df["name"].astype(str).tolist()
else:
    texts = df["search_embedding_text"].astype(str).tolist()

embeddings = model.encode(texts, normalize_embeddings=True)
embeddings = np.array(embeddings).astype("float32")

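# The embeddings are L2-normalized above, so inner product equals cosine
# similarity and IndexFlatIP performs exact cosine-similarity search.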
print("πŸ“Œ Building FAISS index...")
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

print("πŸ“Œ Saving FAISS index...")
faiss.write_index(index, OUTPUT_INDEX)

print("πŸ“Œ Saving index mapping...")
df[["name", "domain", "category", "region"]].to_csv(OUTPUT_MAPPING, index=False)

print("\nπŸŽ‰ SUCCESS: New FAISS index created and synced with dataset!")