yair319732 committed
Commit 1b43026 · verified · 1 Parent(s): b136135

Upload folder using huggingface_hub

Files changed (6)
  1. README.md +3 -24
  2. app.py +15 -54
  3. data/slogan.csv +0 -0
  4. logic/cleaning.py +23 -66
  5. logic/search.py +4 -7
  6. requirements.txt +0 -1
README.md CHANGED
@@ -4,31 +4,10 @@ emoji: 🏷️
 colorFrom: yellow
 colorTo: green
 sdk: gradio
-sdk_version: 5.43.1
+sdk_version: "4.0.0"
 app_file: app.py
 pinned: false
 ---
-# Slogan Finder — Hugging Face Space
-
-This Space searches your company's **real taglines** via **Sentence-Transformers + FAISS** and an optional **CrossEncoder** re-ranker.
-
-## TL;DR (works now with sample data)
-1. Click "Spaces" → "Create new Space" → SDK: **Gradio** → set **Python 3.10**.
-2. Upload this repo (or `hf-slogan-space.zip`) contents to the Space.
-3. The Space will boot and run on a tiny sample dataset so you can see it working.
-4. Replace the sample data with **your full dataset** and run `prepare_assets.py` locally to generate new `assets/`. Commit those to the Space.
-
-## Use with your real data
-- Export a CSV/Parquet from your notebook with at least a `tagline` column (optional `description`).
-- Update `INPUT_PATH` in `prepare_assets.py` to point at your file.
-- Run locally:
-```bash
-pip install -r requirements.txt
-python prepare_assets.py
-python scripts/run_local_validation.py
-```
-- Commit the generated `assets/` directory to your Space repo and push.
-
-## Notes
-- Cosine similarity is used by default (`IndexFlatIP` + normalized embeddings). If you prefer L2, set `NORMALIZE=False` in `prepare_assets.py`.
-- The UI lets you toggle CrossEncoder reranking at runtime.
+
+# Slogan Finder
+Search **real slogans** (SBERT + FAISS) and get **1 AI-generated** suggestion.
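For context: the removed README note about cosine similarity (`IndexFlatIP` + normalized embeddings) still matches how `app.py` builds its index with `NORMALIZE = True`. A minimal sketch of that choice, with illustrative texts (not code from this commit):

```python
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

NORMALIZE = True  # mirrors app.py: inner product on unit vectors == cosine

encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
texts = ["Less Guessing. More Healing.", "Built to Grow with Your Cart."]
emb = encoder.encode(texts, convert_to_numpy=True,
                     normalize_embeddings=NORMALIZE).astype("float32")

# IndexFlatIP ranks by inner product; with NORMALIZE=False you would switch
# to IndexFlatL2 (Euclidean distance) and skip the normalization step.
index = faiss.IndexFlatIP(emb.shape[1]) if NORMALIZE else faiss.IndexFlatL2(emb.shape[1])
index.add(emb)

q = encoder.encode(["payments for small stores"], convert_to_numpy=True,
                   normalize_embeddings=NORMALIZE).astype("float32")
scores, ids = index.search(q, 2)  # scores are cosine similarities when NORMALIZE=True
```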
app.py CHANGED
@@ -1,6 +1,4 @@
-# app.py — uses a pre-created dataset at data/slogan.csv
-# Output: EXACTLY 3 vector-based slogans + 1 AI-generated slogan
-
+\
 import os, json, numpy as np, pandas as pd
 import gradio as gr
 import faiss
@@ -11,24 +9,19 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from logic.cleaning import clean_dataframe
 from logic.search import SloganSearcher
 
-# ===================== Config =====================
 ASSETS_DIR = "assets"
-DATA_PATH = "data/slogan.csv"  # <-- your pre-created dataset (CSV) with columns: tagline, description
+DATA_PATH = "data/slogan.csv"
 
-# Retrieval encoder (cosine via inner product)
 MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
-NORMALIZE = True  # True -> use IndexFlatIP (cosine), False -> IndexFlatL2
+NORMALIZE = True
 
-# Generator (CPU-friendly)
 GEN_MODEL_NAME = "google/flan-t5-base"
 NUM_GEN_CANDIDATES = 6
 MAX_NEW_TOKENS = 24
 TEMPERATURE = 0.9
 TOP_P = 0.95
-# Generated slogan should not be too similar to any of the retrieved ones
 NOVELTY_SIM_THRESHOLD = 0.80
 
-# Asset paths
 META_PATH = os.path.join(ASSETS_DIR, "meta.json")
 PARQUET_PATH = os.path.join(ASSETS_DIR, "slogans_clean.parquet")
 INDEX_PATH = os.path.join(ASSETS_DIR, "faiss.index")
@@ -36,11 +29,9 @@ EMB_PATH = os.path.join(ASSETS_DIR, "embeddings.npy")
 
 def _log(m): print(f"[SLOGAN-SPACE] {m}", flush=True)
 
-# ===================== Build assets from data/slogan.csv =====================
 def _build_assets():
     if not os.path.exists(DATA_PATH):
-        raise FileNotFoundError(f"Dataset not found at {DATA_PATH}. Expecting a CSV with 'tagline' and optional 'description'.")
-
+        raise FileNotFoundError(f"Dataset not found at {DATA_PATH} (CSV with columns: 'tagline', 'description').")
     os.makedirs(ASSETS_DIR, exist_ok=True)
 
     _log(f"Loading dataset: {DATA_PATH}")
@@ -50,7 +41,6 @@ def _build_assets():
     df = clean_dataframe(df)
     _log(f"Rows after cleaning: {len(df)}")
 
-    # Choose text field for embeddings
     if "description" in df.columns and df["description"].notna().any():
         texts = df["description"].fillna(df["tagline"]).astype(str).tolist()
         text_col, fallback_col = "description", "tagline"
@@ -82,7 +72,6 @@ def _build_assets():
     }
     with open(META_PATH, "w") as f:
         json.dump(meta, f, indent=2)
-
     _log("Assets built successfully.")
 
 def _ensure_assets():
@@ -101,43 +90,28 @@ def _ensure_assets():
         _log(f"Parquet read failed ({e}); rebuilding assets.")
         _build_assets()
 
-# ===================== Bootstrap BEFORE UI =====================
 _ensure_assets()
 
-# ===================== Load retrieval & generator =====================
-# Retrieval searcher (uses assets + same encoder as in meta.json)
 searcher = SloganSearcher(assets_dir=ASSETS_DIR, use_rerank=False)
 
-# Encoder for novelty check (same as retrieval)
-_meta = json.load(open(META_PATH))
-_encoder = SentenceTransformer(_meta["model_name"])
+meta = json.load(open(META_PATH))
+_encoder = SentenceTransformer(meta["model_name"])
 
-# Generator (FLAN-T5)
 _gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
 _gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
 
+# ---- Prompt (adjust if you want your exact wording) ----
 def _prompt_for(description: str) -> str:
     return (
-        "You are a creative brand copywriter. Write short, original, memorable startup slogans (max 8 words).\n"
-        "Forbidden words: app, assistant, platform, solution, system, marketplace, AI, machine learning, augmented reality, virtual reality, decentralized, empower.\n"
-        "Focus on clear benefits and vivid verbs. Do not copy the description. Return ONLY a list, one slogan per line.\n\n"
-        "Good Examples:\n"
-        "Description: AI assistant for doctors to prioritize patient cases\n"
-        "Slogan: Less Guessing. More Healing.\n\n"
-        "Description: Payments for small online stores\n"
-        "Slogan: Built to Grow with Your Cart.\n\n"
-        "Description: Neurotech headset to boost focus\n"
-        "Slogan: Train Your Brain to Win.\n\n"
-        "Description: Interior design suggestions with AI\n"
-        "Slogan: Style That Thinks With You.\n\n"
-        "Bad Examples (avoid these): Innovative AI Platform / Smart App for Everyone / Empowering Small Businesses\n\n"
-        "for the following product/company description:\n\n"
-        f"{description}\n\nSlogan:"
+        "You are a professional slogan writer. "
+        "Write ONE original, catchy startup slogan under 8 words, Title Case, no punctuation. "
+        "Do not copy examples. Description:\n"
+        f"{description}\nSlogan:"
     )
 
 def _generate_candidates(description: str, n: int = NUM_GEN_CANDIDATES):
     prompt = _prompt_for(description)
-    inputs = _gen_tokenizer([prompt] * n, return_tensors="pt", padding=True, truncation=True)
+    inputs = _gen_tokenizer([prompt]*n, return_tensors="pt", padding=True, truncation=True)
     outputs = _gen_model.generate(
         **inputs,
         do_sample=True,
@@ -151,40 +125,29 @@ def _generate_candidates(description: str, n: int = NUM_GEN_CANDIDATES):
     return [t.replace("Slogan:", "").strip().strip('"') for t in texts if t.strip()]
 
 def _pick_most_novel(candidates, retrieved_texts):
-    """
-    Choose the candidate with the lowest max cosine similarity to any retrieved slogan.
-    """
     if not candidates:
         return None
     R = _encoder.encode(retrieved_texts, convert_to_numpy=True, normalize_embeddings=True) if retrieved_texts else None
-
     best, best_novelty = None, -1e9
     for c in candidates:
         c_emb = _encoder.encode([c], convert_to_numpy=True, normalize_embeddings=True)
         if R is None or len(retrieved_texts) == 0:
            max_sim = 0.0
        else:
-            sims = np.dot(R, c_emb[0])  # cosine (embeddings are normalized)
+            sims = np.dot(R, c_emb[0])  # cosine
            max_sim = float(np.max(sims))
        novelty = 1.0 - max_sim
        if (max_sim < NOVELTY_SIM_THRESHOLD and novelty > best_novelty) or best is None and novelty > best_novelty:
            best, best_novelty = c, novelty
    return best
 
-# ===================== Inference (exactly 3 + 1) =====================
 def run_pipeline(user_description: str):
     if not user_description or not user_description.strip():
         return "Please enter a description."
-
-    # 1) Retrieve top-3 vector matches
     retrieved_df = searcher.search(user_description, top_k=3, rerank_top_n=10)
     retrieved_texts = retrieved_df["display"].tolist() if not retrieved_df.empty else []
-
-    # 2) Generate candidates and pick a novel one
-    gen_candidates = _generate_candidates(user_description, NUM_GEN_CANDIDATES)
-    generated = _pick_most_novel(gen_candidates, retrieved_texts) or (gen_candidates[0] if gen_candidates else "—")
-
-    # 3) Render results
+    gens = _generate_candidates(user_description, NUM_GEN_CANDIDATES)
+    generated = _pick_most_novel(gens, retrieved_texts) or (gens[0] if gens else "—")
     lines = []
     lines.append("### 🔎 Top 3 similar slogans")
     if retrieved_texts:
@@ -192,12 +155,10 @@ def run_pipeline(user_description: str):
         lines.append(f"{i}. {s}")
     else:
         lines.append("_No similar slogans found._")
-
     lines.append("\n### ✨ AI-generated suggestion")
     lines.append(generated)
     return "\n".join(lines)
 
-# ===================== UI =====================
 with gr.Blocks(title="Slogan Finder") as demo:
     gr.Markdown("# 🔎 Slogan Finder\nDescribe your product/company; get 3 similar slogans + 1 AI-generated suggestion.")
     query = gr.Textbox(label="Describe your product/company", placeholder="AI-powered patient financial navigation platform...")
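The pipeline shape is unchanged: sample several FLAN-T5 candidates, then keep the one least similar to the retrieved slogans (cosine below the novelty threshold). A condensed, self-contained sketch of those two steps; `generate_candidates` and `most_novel` are hypothetical stand-ins for the file's `_generate_candidates` and `_pick_most_novel`:

```python
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

NOVELTY_SIM_THRESHOLD = 0.80  # same value as in app.py

tok = AutoTokenizer.from_pretrained("google/flan-t5-base")
gen = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
enc = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def generate_candidates(prompt: str, n: int = 6) -> list[str]:
    # Batch the same prompt n times; sampling makes each decode differ.
    inputs = tok([prompt] * n, return_tensors="pt", padding=True, truncation=True)
    out = gen.generate(**inputs, do_sample=True, temperature=0.9, top_p=0.95,
                       max_new_tokens=24)
    return [t.strip() for t in tok.batch_decode(out, skip_special_tokens=True) if t.strip()]

def most_novel(cands: list[str], retrieved: list[str]) -> str:
    # normalize_embeddings=True yields unit vectors, so R @ c is cosine similarity.
    if not cands:
        return ""
    R = enc.encode(retrieved, convert_to_numpy=True, normalize_embeddings=True) if retrieved else None
    best, best_novelty = cands[0], -1.0
    for c in cands:
        c_emb = enc.encode([c], convert_to_numpy=True, normalize_embeddings=True)[0]
        max_sim = float(np.max(R @ c_emb)) if R is not None else 0.0
        # Keep the least-similar candidate that clears the novelty threshold.
        if max_sim < NOVELTY_SIM_THRESHOLD and 1.0 - max_sim > best_novelty:
            best, best_novelty = c, 1.0 - max_sim
    return best
```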
data/slogan.csv CHANGED
The diff for this file is too large to render. See raw diff
 
logic/cleaning.py CHANGED
@@ -1,24 +1,19 @@
-# logic/cleaning.py
-
+\
 import pandas as pd
-import re
-import unicodedata
+import re, unicodedata
 from html import unescape
 
-# ==== Tunables (match your EDA) ====
-MIN_LEN = 20  # based on your histogram (most taglines 20–60 chars)
+MIN_LEN = 20
 MAX_LEN = 60
-KEEP_ASCII_ONLY = False  # set True if you want to drop non-ASCII taglines
-MIN_ALPHA_RATIO = 0.60   # at least 60% letters to avoid gibberish
-DROP_IF_ALL_CAPS = False # set True if you want to drop SHOUTY taglines
+KEEP_ASCII_ONLY = False
+MIN_ALPHA_RATIO = 0.60
+DROP_IF_ALL_CAPS = False
 
-# Very generic/buzzy words to exclude (your Colab had anti-buzz heuristics)
-# Keep short and conservative to avoid over-filtering
 BUZZY = {
-    "synergy", "cutting edge", "cutting-edge", "best in class", "best-in-class",
-    "world class", "world-class", "state of the art", "state-of-the-art",
-    "revolutionary", "disruptive platform", "next generation", "next-gen",
-    "leading provider", "scalable solution"
+    "synergy","cutting edge","cutting-edge","best in class","best-in-class",
+    "world class","world-class","state of the art","state-of-the-art",
+    "revolutionary","disruptive platform","next generation","next-gen",
+    "leading provider","scalable solution"
 }
 
 URL_RE = re.compile(r"(https?://|www\.)\S+", re.I)
@@ -26,35 +21,27 @@ EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.I)
 PHONE_RE = re.compile(r"(\+?\d[\d\-\s()]{6,}\d)")
 WS_RE = re.compile(r"\s+")
 PUNCT_RE = re.compile(r"[^\w\s]+")
+TM_RE = re.compile(r"[®©™]")
 
-# things you wanted removed frequently
-TRADEMARKS_RE = re.compile(r"[®©™]")
-
-def _nfkc(s: str) -> str:
-    return unicodedata.normalize("NFKC", s)
-
-def _normalize_spaces(s: str) -> str:
-    return WS_RE.sub(" ", s).strip()
+def _nfkc(s): return unicodedata.normalize("NFKC", s)
 
 def _clean_text(s: str) -> str:
     s = "" if s is None else str(s)
     s = unescape(s)
     s = _nfkc(s)
-    s = s.replace("\n", " ").replace("\r", " ")
-    s = TRADEMARKS_RE.sub("", s)
-    s = _normalize_spaces(s)
+    s = s.replace("\n"," ").replace("\r"," ")
+    s = TM_RE.sub("", s)
+    s = WS_RE.sub(" ", s).strip()
     return s
 
 def _alpha_ratio(s: str) -> float:
-    if not s:
-        return 0.0
+    if not s: return 0.0
    letters = sum(ch.isalpha() for ch in s)
    return letters / max(1, len(s))
 
 def _looks_shouty(s: str) -> bool:
     letters = [ch for ch in s if ch.isalpha()]
-    if not letters:
-        return False
+    if not letters: return False
     uppers = sum(ch.isupper() for ch in letters)
     return uppers / len(letters) >= 0.85
 
@@ -67,73 +54,43 @@ def _has_junk(s: str) -> bool:
 
 def _ascii_only(s: str) -> bool:
     try:
-        s.encode("ascii")
-        return True
+        s.encode("ascii"); return True
     except Exception:
         return False
 
-def _norm_for_dupe_key(s: str) -> str:
-    # robust duplicate key: lowercase, strip punctuation & collapse spaces
+def _dupe_key(s: str) -> str:
     s = s.lower()
     s = PUNCT_RE.sub(" ", s)
-    s = _normalize_spaces(s)
+    s = WS_RE.sub(" ", s).strip()
     return s
 
 def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Full cleaning aligned with your Colab/EDA:
-    - normalize text (NFKC, remove TM/®/©, collapse spaces)
-    - drop rows with URLs/emails/phones
-    - optional ASCII gate
-    - enforce alpha ratio to avoid gibberish
-    - apply strict length band (20–60 chars by default)
-    - drop shouty lines (optional)
-    - remove generic/buzzy marketing boilerplate
-    - robust de-duplication (punct/space-insensitive)
-    Required: 'tagline'. Optional: 'description' (falls back to tagline).
-    """
     if "tagline" not in df.columns:
-        raise ValueError("Input data must contain a 'tagline' column.")
-
+        raise ValueError("Input must contain a 'tagline' column.")
     df = df.copy()
-
-    # ensure description exists (your earlier cells often used description for embeddings)
     if "description" not in df.columns:
         df["description"] = df["tagline"]
 
-    # normalize both columns
     df["tagline"] = df["tagline"].map(_clean_text)
     df["description"] = df["description"].map(_clean_text)
 
-    # drop empties after normalization
     df = df[(df["tagline"].str.len() > 0)]
-
-    # remove obvious junk (links, emails, phones)
     mask_junk = df["tagline"].map(_has_junk) | df["description"].map(_has_junk)
     df = df[~mask_junk]
 
-    # optional: ASCII only
     if KEEP_ASCII_ONLY:
         df = df[df["tagline"].map(_ascii_only)]
 
-    # alpha ratio (avoid too-symbolic/noisy strings)
     df = df[df["tagline"].map(_alpha_ratio) >= MIN_ALPHA_RATIO]
-
-    # length band from your EDA
     df = df[df["tagline"].str.len().between(MIN_LEN, MAX_LEN)]
 
-    # optional: drop SHOUTY
     if DROP_IF_ALL_CAPS:
         df = df[~df["tagline"].map(_looks_shouty)]
 
-    # exclude very buzzy phrases
     df = df[~df["tagline"].map(_contains_buzzy)]
 
-    # final robust de-duplication (ignore punctuation/case/extra spaces)
-    dupe_key = df["tagline"].map(_norm_for_dupe_key)
-    df = df.loc[~dupe_key.duplicated()].reset_index(drop=True)
+    key = df["tagline"].map(_dupe_key)
+    df = df.loc[~key.duplicated()].reset_index(drop=True)
 
-    # if description is empty after cleaning, fall back to tagline
     df.loc[df["description"].str.len() == 0, "description"] = df["tagline"]
-
    return df
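`clean_dataframe`'s contract is unchanged: `tagline` is required, `description` is optional and back-filled from `tagline`. A quick usage sketch with made-up rows; which rows survive depends on the tunables above:

```python
import pandas as pd
from logic.cleaning import clean_dataframe

# Hypothetical rows: one clean tagline, one with a URL, one with a buzzy phrase.
raw = pd.DataFrame({
    "tagline": [
        "Train Your Brain to Win Every Day",      # within the 20-60 char band
        "Visit https://example.com for details",  # dropped by the URL/junk filter
        "Our scalable solution for everything",   # dropped by the BUZZY filter
    ]
})

clean = clean_dataframe(raw)
print(clean[["tagline", "description"]])  # description back-filled from tagline
```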
logic/search.py CHANGED
@@ -1,5 +1,6 @@
-
-import json, numpy as np, pandas as pd, os
+\
+import json, os
+import numpy as np, pandas as pd
 import faiss
 from sentence_transformers import SentenceTransformer, CrossEncoder
 
@@ -7,7 +8,7 @@ class SloganSearcher:
     def __init__(self, assets_dir="assets", use_rerank=False, rerank_model="cross-encoder/stsb-roberta-base"):
         meta_path = os.path.join(assets_dir, "meta.json")
         if not os.path.exists(meta_path):
-            raise FileNotFoundError(f"Missing {meta_path}. Run prepare_assets.py first.")
+            raise FileNotFoundError(f"Missing {meta_path}. Build assets first.")
         with open(meta_path, "r") as f:
             self.meta = json.load(f)
 
@@ -25,15 +26,12 @@ class SloganSearcher:
     def search(self, query: str, top_k=5, rerank_top_n=20):
         if not isinstance(query, str) or len(query.strip()) == 0:
             return pd.DataFrame(columns=["display", "score"] + (["rerank_score"] if self.use_rerank else []))
-
         q = self.encoder.encode([query], convert_to_numpy=True, normalize_embeddings=self.norm)
         sims, idxs = self.index.search(q, max(int(top_k), int(rerank_top_n) if self.use_rerank else int(top_k)))
         idxs = idxs[0].tolist()
         sims = sims[0].tolist()
-
         results = self.df.iloc[idxs].copy()
         results["score"] = sims
-
         if self.use_rerank:
             texts = results[self.text_col].fillna(results[self.fallback_col]).astype(str).tolist()
             pairs = [[query, t] for t in texts]
@@ -42,7 +40,6 @@ class SloganSearcher:
             results = results.sort_values("rerank_score", ascending=False).head(int(top_k))
         else:
             results = results.head(int(top_k))
-
         results["display"] = results[self.fallback_col]
         cols = ["display", "score"] + (["rerank_score"] if self.use_rerank else [])
         return results[cols]
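`SloganSearcher`'s public surface is unchanged; only the imports and the error message differ. A minimal usage sketch, assuming `assets/` was already built (for example by `app.py`'s `_build_assets`):

```python
from logic.search import SloganSearcher

# Rerank stays off here, matching how app.py constructs the searcher.
searcher = SloganSearcher(assets_dir="assets", use_rerank=False)

hits = searcher.search("payments for small online stores", top_k=3, rerank_top_n=10)
print(hits[["display", "score"]])  # top-3 taglines with cosine scores
```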
requirements.txt CHANGED
@@ -6,5 +6,4 @@ pandas>=2.1.0
 numpy>=1.26.0
 pyarrow>=14.0.1
 torch
-kagglehub>=0.2.5
 transformers>=4.40.0