import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
DATA_FILE = "salahkar_enhanced.csv"
OUTPUT_INDEX = "faiss_index.bin"
OUTPUT_MAPPING = "index_mapping.csv"

print("šŸ“Œ Loading dataset...")
df = pd.read_csv(DATA_FILE)

print("šŸ“Œ Loading embedding model...")
model = SentenceTransformer(MODEL_NAME)

print("šŸ“Œ Generating embeddings...")
if "search_embedding_text" not in df.columns:
    # Fall back to the 'name' column if the dedicated embedding text is absent.
    print("⚠ 'search_embedding_text' column missing. Using 'name' instead.")
    texts = df["name"].astype(str).tolist()
else:
    texts = df["search_embedding_text"].astype(str).tolist()

# Normalized embeddings let an inner-product index act as cosine similarity.
embeddings = model.encode(texts, normalize_embeddings=True)
embeddings = np.array(embeddings).astype("float32")

print("šŸ“Œ Building FAISS index...")
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

print("šŸ“Œ Saving FAISS index...")
faiss.write_index(index, OUTPUT_INDEX)

print("šŸ“Œ Saving index mapping...")
# Row order in this mapping matches the FAISS vector IDs (0..n-1).
df[["name", "domain", "category", "region"]].to_csv(OUTPUT_MAPPING, index=False)

print("\nšŸŽ‰ SUCCESS: New FAISS index created and synced with dataset!")
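
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the build step itself): one way the
# artifacts written above could be loaded and queried. It reuses MODEL_NAME,
# OUTPUT_INDEX, OUTPUT_MAPPING, and the already-loaded `model` from this
# script; the query string and `top_k` below are placeholders, not values
# from the dataset.
# ---------------------------------------------------------------------------
search_index = faiss.read_index(OUTPUT_INDEX)
mapping = pd.read_csv(OUTPUT_MAPPING)

query = "example query text"  # placeholder query for demonstration
top_k = 5                     # placeholder number of results

# Encode the query the same way as the corpus: normalized, float32.
query_vec = model.encode([query], normalize_embeddings=True).astype("float32")

# Inner product on normalized vectors is equivalent to cosine similarity.
scores, ids = search_index.search(query_vec, top_k)

for score, idx in zip(scores[0], ids[0]):
    row = mapping.iloc[int(idx)]
    print(f"{row['name']} ({row['category']}, {row['region']}) - score {score:.3f}")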