Upload folder using huggingface_hub

- README.md +3 -24
- app.py +15 -54
- data/slogan.csv +0 -0
- logic/cleaning.py +23 -66
- logic/search.py +4 -7
- requirements.txt +0 -1
README.md
CHANGED
@@ -4,31 +4,10 @@ emoji: 🏷️
 colorFrom: yellow
 colorTo: green
 sdk: gradio
-sdk_version:
+sdk_version: "4.0.0"
 app_file: app.py
 pinned: false
 ---
-# Slogan Finder — Hugging Face Space
 
-
-
-## TL;DR (works now with sample data)
-1. Click "Spaces" → "Create new Space" → SDK: **Gradio** → set **Python 3.10**.
-2. Upload this repo (or `hf-slogan-space.zip`) contents to the Space.
-3. The Space will boot and run on a tiny sample dataset so you can see it working.
-4. Replace the sample data with **your full dataset** and run `prepare_assets.py` locally to generate new `assets/`. Commit those to the Space.
-
-## Use with your real data
-- Export a CSV/Parquet from your notebook with at least a `tagline` column (optional `description`).
-- Update `INPUT_PATH` in `prepare_assets.py` to point at your file.
-- Run locally:
-```bash
-pip install -r requirements.txt
-python prepare_assets.py
-python scripts/run_local_validation.py
-```
-- Commit the generated `assets/` directory to your Space repo and push.
-
-## Notes
-- Cosine similarity is used by default (`IndexFlatIP` + normalized embeddings). If you prefer L2, set `NORMALIZE=False` in `prepare_assets.py`.
-- The UI lets you toggle CrossEncoder reranking at runtime.
+# Slogan Finder
+Search **real slogans** (SBERT + FAISS) and get **1 AI-generated** suggestion.

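The trimmed README drops the old setup notes, so for orientation here is a minimal sketch of what the asset-building step amounts to: encode each slogan with the MiniLM SBERT model, normalize, and add the vectors to a FAISS `IndexFlatIP` so inner-product search behaves like cosine similarity. The `build_index` helper below is an illustrative assumption, not the repo's `prepare_assets.py` or `_build_assets`.

```python
# Illustrative sketch of asset building (assumed helper, not the repo's code).
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

def build_index(csv_path="data/slogan.csv",
                model_name="sentence-transformers/all-MiniLM-L6-v2",
                normalize=True):
    df = pd.read_csv(csv_path)  # expects at least a 'tagline' column
    texts = df.get("description", df["tagline"]).fillna(df["tagline"]).astype(str).tolist()
    encoder = SentenceTransformer(model_name)
    emb = encoder.encode(texts, convert_to_numpy=True, normalize_embeddings=normalize)
    index = faiss.IndexFlatIP(emb.shape[1])  # inner product == cosine on normalized vectors
    index.add(emb.astype(np.float32))
    return df, index
```

With normalized vectors, inner-product search is equivalent to cosine similarity, which is why the Space defaults to `IndexFlatIP`.
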
app.py
CHANGED
@@ -1,6 +1,4 @@
-
-# Output: EXACTLY 3 vector-based slogans + 1 AI-generated slogan
-
+\
 import os, json, numpy as np, pandas as pd
 import gradio as gr
 import faiss
@@ -11,24 +9,19 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from logic.cleaning import clean_dataframe
 from logic.search import SloganSearcher
 
-# ===================== Config =====================
 ASSETS_DIR = "assets"
-DATA_PATH = "data/slogan.csv"
+DATA_PATH = "data/slogan.csv"
 
-# Retrieval encoder (cosine via inner product)
 MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
-NORMALIZE = True
+NORMALIZE = True
 
-# Generator (CPU-friendly)
 GEN_MODEL_NAME = "google/flan-t5-base"
 NUM_GEN_CANDIDATES = 6
 MAX_NEW_TOKENS = 24
 TEMPERATURE = 0.9
 TOP_P = 0.95
-# Generated slogan should not be too similar to any of the retrieved ones
 NOVELTY_SIM_THRESHOLD = 0.80
 
-# Asset paths
 META_PATH = os.path.join(ASSETS_DIR, "meta.json")
 PARQUET_PATH = os.path.join(ASSETS_DIR, "slogans_clean.parquet")
 INDEX_PATH = os.path.join(ASSETS_DIR, "faiss.index")
@@ -36,11 +29,9 @@ EMB_PATH = os.path.join(ASSETS_DIR, "embeddings.npy")
 
 def _log(m): print(f"[SLOGAN-SPACE] {m}", flush=True)
 
-# ===================== Build assets from data/slogan.csv =====================
 def _build_assets():
     if not os.path.exists(DATA_PATH):
-        raise FileNotFoundError(f"Dataset not found at {DATA_PATH}
-
+        raise FileNotFoundError(f"Dataset not found at {DATA_PATH} (CSV with columns: 'tagline', 'description').")
     os.makedirs(ASSETS_DIR, exist_ok=True)
 
     _log(f"Loading dataset: {DATA_PATH}")
@@ -50,7 +41,6 @@ def _build_assets():
     df = clean_dataframe(df)
     _log(f"Rows after cleaning: {len(df)}")
 
-    # Choose text field for embeddings
     if "description" in df.columns and df["description"].notna().any():
         texts = df["description"].fillna(df["tagline"]).astype(str).tolist()
         text_col, fallback_col = "description", "tagline"
@@ -82,7 +72,6 @@ def _build_assets():
     }
     with open(META_PATH, "w") as f:
         json.dump(meta, f, indent=2)
-
     _log("Assets built successfully.")
 
 def _ensure_assets():
@@ -101,43 +90,28 @@ def _ensure_assets():
         _log(f"Parquet read failed ({e}); rebuilding assets.")
         _build_assets()
 
-# ===================== Bootstrap BEFORE UI =====================
 _ensure_assets()
 
-# ===================== Load retrieval & generator =====================
-# Retrieval searcher (uses assets + same encoder as in meta.json)
 searcher = SloganSearcher(assets_dir=ASSETS_DIR, use_rerank=False)
 
-
-
-_encoder = SentenceTransformer(_meta["model_name"])
+meta = json.load(open(META_PATH))
+_encoder = SentenceTransformer(meta["model_name"])
 
-# Generator (FLAN-T5)
 _gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
 _gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
 
+# ---- Prompt (adjust if you want your exact wording) ----
 def _prompt_for(description: str) -> str:
     return (
-        "You are a
-        "
-        "
-        "
-        "Description: AI assistant for doctors to prioritize patient cases\n"
-        "Slogan: Less Guessing. More Healing.\n\n"
-        "Description: Payments for small online stores\n"
-        "Slogan: Built to Grow with Your Cart.\n\n"
-        "Description: Neurotech headset to boost focus\n"
-        "Slogan: Train Your Brain to Win.\n\n"
-        "Description: Interior design suggestions with AI\n"
-        "Slogan: Style That Thinks With You.\n\n"
-        "Bad Examples (avoid these): Innovative AI Platform / Smart App for Everyone / Empowering Small Businesses\n\n"
-        "for the following product/company description:\n\n"
-        f"{description}\n\nSlogan:"
+        "You are a professional slogan writer. "
+        "Write ONE original, catchy startup slogan under 8 words, Title Case, no punctuation. "
+        "Do not copy examples. Description:\n"
+        f"{description}\nSlogan:"
     )
 
 def _generate_candidates(description: str, n: int = NUM_GEN_CANDIDATES):
     prompt = _prompt_for(description)
-    inputs = _gen_tokenizer([prompt]
+    inputs = _gen_tokenizer([prompt]*n, return_tensors="pt", padding=True, truncation=True)
     outputs = _gen_model.generate(
         **inputs,
         do_sample=True,
@@ -151,40 +125,29 @@ def _generate_candidates(description: str, n: int = NUM_GEN_CANDIDATES):
     return [t.replace("Slogan:", "").strip().strip('"') for t in texts if t.strip()]
 
 def _pick_most_novel(candidates, retrieved_texts):
-    """
-    Choose the candidate with the lowest max cosine similarity to any retrieved slogan.
-    """
     if not candidates:
         return None
     R = _encoder.encode(retrieved_texts, convert_to_numpy=True, normalize_embeddings=True) if retrieved_texts else None
-
     best, best_novelty = None, -1e9
     for c in candidates:
         c_emb = _encoder.encode([c], convert_to_numpy=True, normalize_embeddings=True)
         if R is None or len(retrieved_texts) == 0:
             max_sim = 0.0
         else:
-            sims = np.dot(R, c_emb[0])  # cosine
+            sims = np.dot(R, c_emb[0])  # cosine
            max_sim = float(np.max(sims))
         novelty = 1.0 - max_sim
         if (max_sim < NOVELTY_SIM_THRESHOLD and novelty > best_novelty) or best is None and novelty > best_novelty:
             best, best_novelty = c, novelty
     return best
 
-# ===================== Inference (exactly 3 + 1) =====================
 def run_pipeline(user_description: str):
     if not user_description or not user_description.strip():
         return "Please enter a description."
-
-    # 1) Retrieve top-3 vector matches
     retrieved_df = searcher.search(user_description, top_k=3, rerank_top_n=10)
     retrieved_texts = retrieved_df["display"].tolist() if not retrieved_df.empty else []
-
-
-    gen_candidates = _generate_candidates(user_description, NUM_GEN_CANDIDATES)
-    generated = _pick_most_novel(gen_candidates, retrieved_texts) or (gen_candidates[0] if gen_candidates else "—")
-
-    # 3) Render results
+    gens = _generate_candidates(user_description, NUM_GEN_CANDIDATES)
+    generated = _pick_most_novel(gens, retrieved_texts) or (gens[0] if gens else "—")
     lines = []
     lines.append("### 🔎 Top 3 similar slogans")
     if retrieved_texts:
@@ -192,12 +155,10 @@ def run_pipeline(user_description: str):
             lines.append(f"{i}. {s}")
     else:
         lines.append("_No similar slogans found._")
-
    lines.append("\n### ✨ AI-generated suggestion")
    lines.append(generated)
    return "\n".join(lines)
 
-# ===================== UI =====================
 with gr.Blocks(title="Slogan Finder") as demo:
     gr.Markdown("# 🔎 Slogan Finder\nDescribe your product/company; get 3 similar slogans + 1 AI-generated suggestion.")
     query = gr.Textbox(label="Describe your product/company", placeholder="AI-powered patient financial navigation platform...")

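One detail worth calling out from the new app.py: because all embeddings are normalized, the `np.dot` in `_pick_most_novel` is a cosine similarity, and a generated candidate is accepted only if its highest similarity to any retrieved slogan stays under `NOVELTY_SIM_THRESHOLD = 0.80`. A toy, self-contained version of that selection rule (illustrative only, names assumed):

```python
# Toy version of the novelty rule in _pick_most_novel (not the app's code).
import numpy as np

NOVELTY_SIM_THRESHOLD = 0.80

def most_novel(candidate_embs: np.ndarray, retrieved_embs: np.ndarray) -> int:
    """Index of the candidate least similar to the retrieved set (rows are unit vectors)."""
    sims = candidate_embs @ retrieved_embs.T      # cosine similarity matrix
    max_sims = sims.max(axis=1)                   # worst-case similarity per candidate
    ok = np.where(max_sims < NOVELTY_SIM_THRESHOLD)[0]
    pool = ok if len(ok) else np.arange(len(max_sims))  # fall back to all candidates
    return int(pool[np.argmin(max_sims[pool])])
```
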
data/slogan.csv
CHANGED
The diff for this file is too large to render.

logic/cleaning.py
CHANGED
@@ -1,24 +1,19 @@
-
-
+\
 import pandas as pd
-import re
-import unicodedata
+import re, unicodedata
 from html import unescape
 
-
-MIN_LEN = 20  # based on your histogram (most taglines 20–60 chars)
+MIN_LEN = 20
 MAX_LEN = 60
-KEEP_ASCII_ONLY = False
-MIN_ALPHA_RATIO = 0.60
-DROP_IF_ALL_CAPS = False
+KEEP_ASCII_ONLY = False
+MIN_ALPHA_RATIO = 0.60
+DROP_IF_ALL_CAPS = False
 
-# Very generic/buzzy words to exclude (your Colab had anti-buzz heuristics)
-# Keep short and conservative to avoid over-filtering
 BUZZY = {
-    "synergy",
-    "world class",
-    "revolutionary",
-    "leading provider",
+    "synergy","cutting edge","cutting-edge","best in class","best-in-class",
+    "world class","world-class","state of the art","state-of-the-art",
+    "revolutionary","disruptive platform","next generation","next-gen",
+    "leading provider","scalable solution"
 }
 
 URL_RE = re.compile(r"(https?://|www\.)\S+", re.I)
@@ -26,35 +21,27 @@ EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.I)
 PHONE_RE = re.compile(r"(\+?\d[\d\-\s()]{6,}\d)")
 WS_RE = re.compile(r"\s+")
 PUNCT_RE = re.compile(r"[^\w\s]+")
+TM_RE = re.compile(r"[®©™]")
 
-
-TRADEMARKS_RE = re.compile(r"[®©™]")
-
-def _nfkc(s: str) -> str:
-    return unicodedata.normalize("NFKC", s)
-
-def _normalize_spaces(s: str) -> str:
-    return WS_RE.sub(" ", s).strip()
+def _nfkc(s): return unicodedata.normalize("NFKC", s)
 
 def _clean_text(s: str) -> str:
     s = "" if s is None else str(s)
     s = unescape(s)
     s = _nfkc(s)
-    s = s.replace("\n",
-    s =
-    s =
+    s = s.replace("\n"," ").replace("\r"," ")
+    s = TM_RE.sub("", s)
+    s = WS_RE.sub(" ", s).strip()
     return s
 
 def _alpha_ratio(s: str) -> float:
-    if not s:
-        return 0.0
+    if not s: return 0.0
     letters = sum(ch.isalpha() for ch in s)
     return letters / max(1, len(s))
 
 def _looks_shouty(s: str) -> bool:
     letters = [ch for ch in s if ch.isalpha()]
-    if not letters:
-        return False
+    if not letters: return False
     uppers = sum(ch.isupper() for ch in letters)
     return uppers / len(letters) >= 0.85
@@ -67,73 +54,43 @@ def _has_junk(s: str) -> bool:
 
 def _ascii_only(s: str) -> bool:
     try:
-        s.encode("ascii")
-        return True
+        s.encode("ascii"); return True
     except Exception:
         return False
 
-def
-    # robust duplicate key: lowercase, strip punctuation & collapse spaces
+def _dupe_key(s: str) -> str:
     s = s.lower()
     s = PUNCT_RE.sub(" ", s)
-    s =
+    s = WS_RE.sub(" ", s).strip()
     return s
 
 def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Full cleaning aligned with your Colab/EDA:
-    - normalize text (NFKC, remove TM/®/©, collapse spaces)
-    - drop rows with URLs/emails/phones
-    - optional ASCII gate
-    - enforce alpha ratio to avoid gibberish
-    - apply strict length band (20–60 chars by default)
-    - drop shouty lines (optional)
-    - remove generic/buzzy marketing boilerplate
-    - robust de-duplication (punct/space-insensitive)
-    Required: 'tagline'. Optional: 'description' (falls back to tagline).
-    """
     if "tagline" not in df.columns:
-        raise ValueError("Input
-
+        raise ValueError("Input must contain a 'tagline' column.")
     df = df.copy()
-
-    # ensure description exists (your earlier cells often used description for embeddings)
     if "description" not in df.columns:
         df["description"] = df["tagline"]
 
-    # normalize both columns
     df["tagline"] = df["tagline"].map(_clean_text)
     df["description"] = df["description"].map(_clean_text)
 
-    # drop empties after normalization
     df = df[(df["tagline"].str.len() > 0)]
-
-    # remove obvious junk (links, emails, phones)
     mask_junk = df["tagline"].map(_has_junk) | df["description"].map(_has_junk)
     df = df[~mask_junk]
 
-    # optional: ASCII only
     if KEEP_ASCII_ONLY:
         df = df[df["tagline"].map(_ascii_only)]
 
-    # alpha ratio (avoid too-symbolic/noisy strings)
     df = df[df["tagline"].map(_alpha_ratio) >= MIN_ALPHA_RATIO]
-
-    # length band from your EDA
     df = df[df["tagline"].str.len().between(MIN_LEN, MAX_LEN)]
 
-    # optional: drop SHOUTY
     if DROP_IF_ALL_CAPS:
         df = df[~df["tagline"].map(_looks_shouty)]
 
-    # exclude very buzzy phrases
     df = df[~df["tagline"].map(_contains_buzzy)]
 
-
-
-    df = df.loc[~dupe_key.duplicated()].reset_index(drop=True)
+    key = df["tagline"].map(_dupe_key)
+    df = df.loc[~key.duplicated()].reset_index(drop=True)
 
-    # if description is empty after cleaning, fall back to tagline
     df.loc[df["description"].str.len() == 0, "description"] = df["tagline"]
-
    return df

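To see the new cleaning rules in action, `clean_dataframe` can be run on a few hand-made rows; the sample taglines below are invented for illustration:

```python
# Quick sanity check of clean_dataframe (sample rows are made up).
import pandas as pd
from logic.cleaning import clean_dataframe

raw = pd.DataFrame({"tagline": [
    "Style That Thinks With You, Every Day",  # kept: 20-60 chars, mostly letters
    "Visit www.example.com now!!!",           # dropped: contains a URL
    "SYNERGY",                                # dropped: too short (and buzzy)
]})
print(clean_dataframe(raw))  # adds a 'description' column and keeps only the first row
```
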
logic/search.py
CHANGED
@@ -1,5 +1,6 @@
-
-import json,
+\
+import json, os
+import numpy as np, pandas as pd
 import faiss
 from sentence_transformers import SentenceTransformer, CrossEncoder
 
@@ -7,7 +8,7 @@ class SloganSearcher:
     def __init__(self, assets_dir="assets", use_rerank=False, rerank_model="cross-encoder/stsb-roberta-base"):
         meta_path = os.path.join(assets_dir, "meta.json")
         if not os.path.exists(meta_path):
-            raise FileNotFoundError(f"Missing {meta_path}.
+            raise FileNotFoundError(f"Missing {meta_path}. Build assets first.")
         with open(meta_path, "r") as f:
             self.meta = json.load(f)
 
@@ -25,15 +26,12 @@ class SloganSearcher:
     def search(self, query: str, top_k=5, rerank_top_n=20):
         if not isinstance(query, str) or len(query.strip()) == 0:
             return pd.DataFrame(columns=["display", "score"] + (["rerank_score"] if self.use_rerank else []))
-
         q = self.encoder.encode([query], convert_to_numpy=True, normalize_embeddings=self.norm)
         sims, idxs = self.index.search(q, max(int(top_k), int(rerank_top_n) if self.use_rerank else int(top_k)))
         idxs = idxs[0].tolist()
         sims = sims[0].tolist()
-
         results = self.df.iloc[idxs].copy()
         results["score"] = sims
-
         if self.use_rerank:
             texts = results[self.text_col].fillna(results[self.fallback_col]).astype(str).tolist()
             pairs = [[query, t] for t in texts]
@@ -42,7 +40,6 @@ class SloganSearcher:
             results = results.sort_values("rerank_score", ascending=False).head(int(top_k))
         else:
             results = results.head(int(top_k))
-
         results["display"] = results[self.fallback_col]
         cols = ["display", "score"] + (["rerank_score"] if self.use_rerank else [])
         return results[cols]

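Assuming `assets/` has already been built, the searcher can also be exercised on its own outside Gradio; this usage sketch (with an invented query) shows the returned `display`/`score` columns:

```python
# Standalone usage sketch of SloganSearcher (requires prebuilt assets/).
from logic.search import SloganSearcher

searcher = SloganSearcher(assets_dir="assets", use_rerank=False)
hits = searcher.search("AI assistant for doctors to prioritize patient cases", top_k=3)
for rank, row in enumerate(hits.itertuples(index=False), start=1):
    print(f"{rank}. {row.display}  (score={row.score:.3f})")
```
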
requirements.txt
CHANGED
@@ -6,5 +6,4 @@ pandas>=2.1.0
 numpy>=1.26.0
 pyarrow>=14.0.1
 torch
-kagglehub>=0.2.5
 transformers>=4.40.0