import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
# Build a FAISS similarity index from the dataset CSV and write a mapping CSV
# whose row i describes vector i of the index.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
DATA_FILE = "salahkar_enhanced.csv"
OUTPUT_INDEX = "faiss_index.bin"
OUTPUT_MAPPING = "index_mapping.csv"

# Preferred text column to embed, and the column used when it is absent.
TEXT_COLUMN = "search_embedding_text"
FALLBACK_TEXT_COLUMN = "name"
# Columns copied into the mapping file.
MAPPING_COLUMNS = ["name", "domain", "category", "region"]

# NOTE(review): the leading "π" / "β" glyphs in the messages below look like
# mis-decoded emoji; they are kept byte-for-byte because the originals cannot
# be recovered from this file — confirm and restore if desired.


def require_mapping_columns(df: pd.DataFrame) -> None:
    """Raise ``KeyError`` if *df* lacks any column needed for the mapping file.

    Called before the model is loaded so that a malformed dataset fails
    immediately, instead of after the embeddings were computed and the index
    file was already written (which would leave index and mapping out of sync).
    """
    missing = [col for col in MAPPING_COLUMNS if col not in df.columns]
    if missing:
        raise KeyError(f"{DATA_FILE} is missing required column(s): {missing}")


def select_texts(df: pd.DataFrame) -> list:
    """Return one string per row of *df* to feed to the embedding model.

    Uses ``search_embedding_text`` when present, otherwise falls back to
    ``name`` (and prints a warning). Missing cells become empty strings:
    a plain ``astype(str)`` would turn them into the literal word "nan",
    which would then be embedded as if it were real content. Rows are never
    dropped, so list position i still corresponds to dataframe row i.
    """
    if TEXT_COLUMN in df.columns:
        column = df[TEXT_COLUMN]
    else:
        print("β 'search_embedding_text' column missing. Using 'name' instead.")
        column = df[FALLBACK_TEXT_COLUMN]
    return column.astype(str).mask(column.isna(), "").tolist()


def as_index_matrix(vectors) -> np.ndarray:
    """Convert encoder output to a C-contiguous float32 matrix.

    Raises ``ValueError`` when the result is not a non-empty 2-D array,
    because the index dimension is read from ``shape[1]``.
    """
    matrix = np.ascontiguousarray(vectors, dtype="float32")
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError(
            f"Expected a non-empty 2-D embedding matrix, got shape {matrix.shape}"
        )
    return matrix


def main() -> None:
    """Read the dataset, embed it, and write the FAISS index and mapping CSV."""
    print("π Loading dataset...")
    df = pd.read_csv(DATA_FILE)
    # Fail fast, before the expensive model load and encode steps.
    require_mapping_columns(df)
    if df.empty:
        raise ValueError(f"{DATA_FILE} contains no rows; nothing to index.")

    print("π Loading embedding model...")
    model = SentenceTransformer(MODEL_NAME)

    print("π Generating embeddings...")
    texts = select_texts(df)
    embeddings = as_index_matrix(model.encode(texts, normalize_embeddings=True))

    print("π Building FAISS index...")
    # Embeddings are requested normalized, so the inner-product index ranks
    # results by cosine similarity.
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    print("π Saving FAISS index...")
    faiss.write_index(index, OUTPUT_INDEX)

    print("π Saving index mapping...")
    df[MAPPING_COLUMNS].to_csv(OUTPUT_MAPPING, index=False)

    print("\nπ SUCCESS: New FAISS index created and synced with dataset!")


if __name__ == "__main__":
    main()