Upload folder using huggingface_hub

- README.md +3 -24
- app.py +15 -54
- data/slogan.csv +0 -0
- logic/cleaning.py +23 -66
- logic/search.py +4 -7
- requirements.txt +0 -1
README.md
CHANGED
@@ -4,31 +4,10 @@ emoji: 🏷️
 colorFrom: yellow
 colorTo: green
 sdk: gradio
-sdk_version:
+sdk_version: "4.0.0"
 app_file: app.py
 pinned: false
 ---
-# Slogan Finder — Hugging Face Space
 
-
-
-## TL;DR (works now with sample data)
-1. Click "Spaces" → "Create new Space" → SDK: **Gradio** → set **Python 3.10**.
-2. Upload this repo (or `hf-slogan-space.zip`) contents to the Space.
-3. The Space will boot and run on a tiny sample dataset so you can see it working.
-4. Replace the sample data with **your full dataset** and run `prepare_assets.py` locally to generate new `assets/`. Commit those to the Space.
-
-## Use with your real data
-- Export a CSV/Parquet from your notebook with at least a `tagline` column (optional `description`).
-- Update `INPUT_PATH` in `prepare_assets.py` to point at your file.
-- Run locally:
-```bash
-pip install -r requirements.txt
-python prepare_assets.py
-python scripts/run_local_validation.py
-```
-- Commit the generated `assets/` directory to your Space repo and push.
-
-## Notes
-- Cosine similarity is used by default (`IndexFlatIP` + normalized embeddings). If you prefer L2, set `NORMALIZE=False` in `prepare_assets.py`.
-- The UI lets you toggle CrossEncoder reranking at runtime.
+# Slogan Finder
+Search **real slogans** (SBERT + FAISS) and get **1 AI-generated** suggestion.

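The trimmed README drops the old setup notes, so for orientation here is a minimal sketch of what the asset-building step amounts to: encode each slogan with the MiniLM SBERT model, normalize, and add the vectors to a FAISS `IndexFlatIP` so inner-product search behaves like cosine similarity. The `build_index` helper below is an illustrative assumption, not the repo's `prepare_assets.py` or `_build_assets`.

```python
# Illustrative sketch of asset building (assumed helper, not the repo's code).
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

def build_index(csv_path="data/slogan.csv",
                model_name="sentence-transformers/all-MiniLM-L6-v2",
                normalize=True):
    df = pd.read_csv(csv_path)  # expects at least a 'tagline' column
    texts = df.get("description", df["tagline"]).fillna(df["tagline"]).astype(str).tolist()
    encoder = SentenceTransformer(model_name)
    emb = encoder.encode(texts, convert_to_numpy=True, normalize_embeddings=normalize)
    index = faiss.IndexFlatIP(emb.shape[1])  # inner product == cosine on normalized vectors
    index.add(emb.astype(np.float32))
    return df, index
```

With normalized vectors, inner-product search is equivalent to cosine similarity, which is why the Space defaults to `IndexFlatIP`.
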
app.py
CHANGED
@@ -1,6 +1,4 @@
-
-# Output: EXACTLY 3 vector-based slogans + 1 AI-generated slogan
-
+\
 import os, json, numpy as np, pandas as pd
 import gradio as gr
 import faiss
@@ -11,24 +9,19 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from logic.cleaning import clean_dataframe
 from logic.search import SloganSearcher
 
-# ===================== Config =====================
 ASSETS_DIR = "assets"
-DATA_PATH = "data/slogan.csv"
+DATA_PATH = "data/slogan.csv"
 
-# Retrieval encoder (cosine via inner product)
 MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
-NORMALIZE = True
+NORMALIZE = True
 
-# Generator (CPU-friendly)
 GEN_MODEL_NAME = "google/flan-t5-base"
 NUM_GEN_CANDIDATES = 6
 MAX_NEW_TOKENS = 24
 TEMPERATURE = 0.9
 TOP_P = 0.95
-# Generated slogan should not be too similar to any of the retrieved ones
 NOVELTY_SIM_THRESHOLD = 0.80
 
-# Asset paths
 META_PATH = os.path.join(ASSETS_DIR, "meta.json")
 PARQUET_PATH = os.path.join(ASSETS_DIR, "slogans_clean.parquet")
 INDEX_PATH = os.path.join(ASSETS_DIR, "faiss.index")
@@ -36,11 +29,9 @@ EMB_PATH = os.path.join(ASSETS_DIR, "embeddings.npy")
 
 def _log(m): print(f"[SLOGAN-SPACE] {m}", flush=True)
 
-# ===================== Build assets from data/slogan.csv =====================
 def _build_assets():
     if not os.path.exists(DATA_PATH):
-        raise FileNotFoundError(f"Dataset not found at {DATA_PATH}
-
+        raise FileNotFoundError(f"Dataset not found at {DATA_PATH} (CSV with columns: 'tagline', 'description').")
     os.makedirs(ASSETS_DIR, exist_ok=True)
 
     _log(f"Loading dataset: {DATA_PATH}")
@@ -50,7 +41,6 @@ def _build_assets():
     df = clean_dataframe(df)
     _log(f"Rows after cleaning: {len(df)}")
 
-    # Choose text field for embeddings
     if "description" in df.columns and df["description"].notna().any():
         texts = df["description"].fillna(df["tagline"]).astype(str).tolist()
         text_col, fallback_col = "description", "tagline"
@@ -82,7 +72,6 @@ def _build_assets():
     }
     with open(META_PATH, "w") as f:
         json.dump(meta, f, indent=2)
-
     _log("Assets built successfully.")
 
 def _ensure_assets():
@@ -101,43 +90,28 @@ def _ensure_assets():
         _log(f"Parquet read failed ({e}); rebuilding assets.")
         _build_assets()
 
-# ===================== Bootstrap BEFORE UI =====================
 _ensure_assets()
 
-# ===================== Load retrieval & generator =====================
-# Retrieval searcher (uses assets + same encoder as in meta.json)
 searcher = SloganSearcher(assets_dir=ASSETS_DIR, use_rerank=False)
 
-
-
-_encoder = SentenceTransformer(_meta["model_name"])
+meta = json.load(open(META_PATH))
+_encoder = SentenceTransformer(meta["model_name"])
 
-# Generator (FLAN-T5)
 _gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
 _gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
 
+# ---- Prompt (adjust if you want your exact wording) ----
 def _prompt_for(description: str) -> str:
     return (
-        "You are a
-        "
-        "
-        "
-        "Description: AI assistant for doctors to prioritize patient cases\n"
-        "Slogan: Less Guessing. More Healing.\n\n"
-        "Description: Payments for small online stores\n"
-        "Slogan: Built to Grow with Your Cart.\n\n"
-        "Description: Neurotech headset to boost focus\n"
-        "Slogan: Train Your Brain to Win.\n\n"
-        "Description: Interior design suggestions with AI\n"
-        "Slogan: Style That Thinks With You.\n\n"
-        "Bad Examples (avoid these): Innovative AI Platform / Smart App for Everyone / Empowering Small Businesses\n\n"
-        "for the following product/company description:\n\n"
-        f"{description}\n\nSlogan:"
+        "You are a professional slogan writer. "
+        "Write ONE original, catchy startup slogan under 8 words, Title Case, no punctuation. "
+        "Do not copy examples. Description:\n"
+        f"{description}\nSlogan:"
     )
 
 def _generate_candidates(description: str, n: int = NUM_GEN_CANDIDATES):
     prompt = _prompt_for(description)
-    inputs = _gen_tokenizer([prompt]
+    inputs = _gen_tokenizer([prompt]*n, return_tensors="pt", padding=True, truncation=True)
     outputs = _gen_model.generate(
         **inputs,
         do_sample=True,
@@ -151,40 +125,29 @@ def _generate_candidates(description: str, n: int = NUM_GEN_CANDIDATES):
     return [t.replace("Slogan:", "").strip().strip('"') for t in texts if t.strip()]
 
 def _pick_most_novel(candidates, retrieved_texts):
-    """
-    Choose the candidate with the lowest max cosine similarity to any retrieved slogan.
-    """
     if not candidates:
         return None
     R = _encoder.encode(retrieved_texts, convert_to_numpy=True, normalize_embeddings=True) if retrieved_texts else None
-
     best, best_novelty = None, -1e9
     for c in candidates:
         c_emb = _encoder.encode([c], convert_to_numpy=True, normalize_embeddings=True)
         if R is None or len(retrieved_texts) == 0:
             max_sim = 0.0
         else:
-            sims = np.dot(R, c_emb[0])  # cosine
+            sims = np.dot(R, c_emb[0])  # cosine
            max_sim = float(np.max(sims))
         novelty = 1.0 - max_sim
         if (max_sim < NOVELTY_SIM_THRESHOLD and novelty > best_novelty) or best is None and novelty > best_novelty:
             best, best_novelty = c, novelty
     return best
 
-# ===================== Inference (exactly 3 + 1) =====================
 def run_pipeline(user_description: str):
     if not user_description or not user_description.strip():
         return "Please enter a description."
-
-    # 1) Retrieve top-3 vector matches
     retrieved_df = searcher.search(user_description, top_k=3, rerank_top_n=10)
     retrieved_texts = retrieved_df["display"].tolist() if not retrieved_df.empty else []
-
-
-    gen_candidates = _generate_candidates(user_description, NUM_GEN_CANDIDATES)
-    generated = _pick_most_novel(gen_candidates, retrieved_texts) or (gen_candidates[0] if gen_candidates else "—")
-
-    # 3) Render results
+    gens = _generate_candidates(user_description, NUM_GEN_CANDIDATES)
+    generated = _pick_most_novel(gens, retrieved_texts) or (gens[0] if gens else "—")
     lines = []
     lines.append("### 🔎 Top 3 similar slogans")
     if retrieved_texts:
@@ -192,12 +155,10 @@ def run_pipeline(user_description: str):
             lines.append(f"{i}. {s}")
     else:
         lines.append("_No similar slogans found._")
-
    lines.append("\n### ✨ AI-generated suggestion")
    lines.append(generated)
    return "\n".join(lines)
 
-# ===================== UI =====================
 with gr.Blocks(title="Slogan Finder") as demo:
     gr.Markdown("# 🔎 Slogan Finder\nDescribe your product/company; get 3 similar slogans + 1 AI-generated suggestion.")
     query = gr.Textbox(label="Describe your product/company", placeholder="AI-powered patient financial navigation platform...")

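One detail worth calling out from the new app.py: because all embeddings are normalized, the `np.dot` in `_pick_most_novel` is a cosine similarity, and a generated candidate is accepted only if its highest similarity to any retrieved slogan stays under `NOVELTY_SIM_THRESHOLD = 0.80`. A toy, self-contained version of that selection rule (illustrative only, names assumed):

```python
# Toy version of the novelty rule in _pick_most_novel (not the app's code).
import numpy as np

NOVELTY_SIM_THRESHOLD = 0.80

def most_novel(candidate_embs: np.ndarray, retrieved_embs: np.ndarray) -> int:
    """Index of the candidate least similar to the retrieved set (rows are unit vectors)."""
    sims = candidate_embs @ retrieved_embs.T      # cosine similarity matrix
    max_sims = sims.max(axis=1)                   # worst-case similarity per candidate
    ok = np.where(max_sims < NOVELTY_SIM_THRESHOLD)[0]
    pool = ok if len(ok) else np.arange(len(max_sims))  # fall back to all candidates
    return int(pool[np.argmin(max_sims[pool])])
```
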
data/slogan.csv
CHANGED
The diff for this file is too large to render.

logic/cleaning.py
CHANGED
@@ -1,24 +1,19 @@
-
-
+\
 import pandas as pd
-import re
-import unicodedata
+import re, unicodedata
 from html import unescape
 
-
-MIN_LEN = 20  # based on your histogram (most taglines 20–60 chars)
+MIN_LEN = 20
 MAX_LEN = 60
-KEEP_ASCII_ONLY = False
-MIN_ALPHA_RATIO = 0.60
-DROP_IF_ALL_CAPS = False
+KEEP_ASCII_ONLY = False
+MIN_ALPHA_RATIO = 0.60
+DROP_IF_ALL_CAPS = False
 
-# Very generic/buzzy words to exclude (your Colab had anti-buzz heuristics)
-# Keep short and conservative to avoid over-filtering
 BUZZY = {
-    "synergy",
-    "world class",
-    "revolutionary",
-    "leading provider",
+    "synergy","cutting edge","cutting-edge","best in class","best-in-class",
+    "world class","world-class","state of the art","state-of-the-art",
+    "revolutionary","disruptive platform","next generation","next-gen",
+    "leading provider","scalable solution"
 }
 
 URL_RE = re.compile(r"(https?://|www\.)\S+", re.I)
@@ -26,35 +21,27 @@ EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.I)
 PHONE_RE = re.compile(r"(\+?\d[\d\-\s()]{6,}\d)")
 WS_RE = re.compile(r"\s+")
 PUNCT_RE = re.compile(r"[^\w\s]+")
+TM_RE = re.compile(r"[®©™]")
 
-
-TRADEMARKS_RE = re.compile(r"[®©™]")
-
-def _nfkc(s: str) -> str:
-    return unicodedata.normalize("NFKC", s)
-
-def _normalize_spaces(s: str) -> str:
-    return WS_RE.sub(" ", s).strip()
+def _nfkc(s): return unicodedata.normalize("NFKC", s)
 
 def _clean_text(s: str) -> str:
     s = "" if s is None else str(s)
     s = unescape(s)
     s = _nfkc(s)
-    s = s.replace("\n",
-    s =
-    s =
+    s = s.replace("\n"," ").replace("\r"," ")
+    s = TM_RE.sub("", s)
+    s = WS_RE.sub(" ", s).strip()
     return s
 
 def _alpha_ratio(s: str) -> float:
-    if not s:
-        return 0.0
+    if not s: return 0.0
     letters = sum(ch.isalpha() for ch in s)
     return letters / max(1, len(s))
 
 def _looks_shouty(s: str) -> bool:
     letters = [ch for ch in s if ch.isalpha()]
-    if not letters:
-        return False
+    if not letters: return False
     uppers = sum(ch.isupper() for ch in letters)
     return uppers / len(letters) >= 0.85
@@ -67,73 +54,43 @@ def _has_junk(s: str) -> bool:
 
 def _ascii_only(s: str) -> bool:
     try:
-        s.encode("ascii")
-        return True
+        s.encode("ascii"); return True
     except Exception:
         return False
 
-def
-    # robust duplicate key: lowercase, strip punctuation & collapse spaces
+def _dupe_key(s: str) -> str:
     s = s.lower()
     s = PUNCT_RE.sub(" ", s)
-    s =
+    s = WS_RE.sub(" ", s).strip()
     return s
 
 def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Full cleaning aligned with your Colab/EDA:
-    - normalize text (NFKC, remove TM/®/©, collapse spaces)
-    - drop rows with URLs/emails/phones
-    - optional ASCII gate
-    - enforce alpha ratio to avoid gibberish
-    - apply strict length band (20–60 chars by default)
-    - drop shouty lines (optional)
-    - remove generic/buzzy marketing boilerplate
-    - robust de-duplication (punct/space-insensitive)
-    Required: 'tagline'. Optional: 'description' (falls back to tagline).
-    """
     if "tagline" not in df.columns:
-        raise ValueError("Input
-
+        raise ValueError("Input must contain a 'tagline' column.")
     df = df.copy()
-
-    # ensure description exists (your earlier cells often used description for embeddings)
     if "description" not in df.columns:
         df["description"] = df["tagline"]
 
-    # normalize both columns
     df["tagline"] = df["tagline"].map(_clean_text)
     df["description"] = df["description"].map(_clean_text)
 
-    # drop empties after normalization
     df = df[(df["tagline"].str.len() > 0)]
-
-    # remove obvious junk (links, emails, phones)
     mask_junk = df["tagline"].map(_has_junk) | df["description"].map(_has_junk)
     df = df[~mask_junk]
 
-    # optional: ASCII only
     if KEEP_ASCII_ONLY:
         df = df[df["tagline"].map(_ascii_only)]
 
-    # alpha ratio (avoid too-symbolic/noisy strings)
     df = df[df["tagline"].map(_alpha_ratio) >= MIN_ALPHA_RATIO]
-
-    # length band from your EDA
     df = df[df["tagline"].str.len().between(MIN_LEN, MAX_LEN)]
 
-    # optional: drop SHOUTY
     if DROP_IF_ALL_CAPS:
         df = df[~df["tagline"].map(_looks_shouty)]
 
-    # exclude very buzzy phrases
     df = df[~df["tagline"].map(_contains_buzzy)]
 
-
-
-    df = df.loc[~dupe_key.duplicated()].reset_index(drop=True)
+    key = df["tagline"].map(_dupe_key)
+    df = df.loc[~key.duplicated()].reset_index(drop=True)
 
-    # if description is empty after cleaning, fall back to tagline
     df.loc[df["description"].str.len() == 0, "description"] = df["tagline"]
-
    return df

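To see the new cleaning rules in action, `clean_dataframe` can be run on a few hand-made rows; the sample taglines below are invented for illustration:

```python
# Quick sanity check of clean_dataframe (sample rows are made up).
import pandas as pd
from logic.cleaning import clean_dataframe

raw = pd.DataFrame({"tagline": [
    "Style That Thinks With You, Every Day",  # kept: 20-60 chars, mostly letters
    "Visit www.example.com now!!!",           # dropped: contains a URL
    "SYNERGY",                                # dropped: too short (and buzzy)
]})
print(clean_dataframe(raw))  # adds a 'description' column and keeps only the first row
```
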
logic/search.py
CHANGED
@@ -1,5 +1,6 @@
-
-import json,
+\
+import json, os
+import numpy as np, pandas as pd
 import faiss
 from sentence_transformers import SentenceTransformer, CrossEncoder
 
@@ -7,7 +8,7 @@ class SloganSearcher:
     def __init__(self, assets_dir="assets", use_rerank=False, rerank_model="cross-encoder/stsb-roberta-base"):
         meta_path = os.path.join(assets_dir, "meta.json")
         if not os.path.exists(meta_path):
-            raise FileNotFoundError(f"Missing {meta_path}.
+            raise FileNotFoundError(f"Missing {meta_path}. Build assets first.")
         with open(meta_path, "r") as f:
             self.meta = json.load(f)
 
@@ -25,15 +26,12 @@ class SloganSearcher:
     def search(self, query: str, top_k=5, rerank_top_n=20):
         if not isinstance(query, str) or len(query.strip()) == 0:
             return pd.DataFrame(columns=["display", "score"] + (["rerank_score"] if self.use_rerank else []))
-
         q = self.encoder.encode([query], convert_to_numpy=True, normalize_embeddings=self.norm)
         sims, idxs = self.index.search(q, max(int(top_k), int(rerank_top_n) if self.use_rerank else int(top_k)))
         idxs = idxs[0].tolist()
         sims = sims[0].tolist()
-
         results = self.df.iloc[idxs].copy()
         results["score"] = sims
-
         if self.use_rerank:
             texts = results[self.text_col].fillna(results[self.fallback_col]).astype(str).tolist()
             pairs = [[query, t] for t in texts]
@@ -42,7 +40,6 @@ class SloganSearcher:
             results = results.sort_values("rerank_score", ascending=False).head(int(top_k))
         else:
             results = results.head(int(top_k))
-
         results["display"] = results[self.fallback_col]
         cols = ["display", "score"] + (["rerank_score"] if self.use_rerank else [])
         return results[cols]

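Assuming `assets/` has already been built, the searcher can also be exercised on its own outside Gradio; this usage sketch (with an invented query) shows the returned `display`/`score` columns:

```python
# Standalone usage sketch of SloganSearcher (requires prebuilt assets/).
from logic.search import SloganSearcher

searcher = SloganSearcher(assets_dir="assets", use_rerank=False)
hits = searcher.search("AI assistant for doctors to prioritize patient cases", top_k=3)
for rank, row in enumerate(hits.itertuples(index=False), start=1):
    print(f"{rank}. {row.display}  (score={row.score:.3f})")
```
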
requirements.txt
CHANGED
@@ -6,5 +6,4 @@ pandas>=2.1.0
 numpy>=1.26.0
 pyarrow>=14.0.1
 torch
-kagglehub>=0.2.5
 transformers>=4.40.0