Spaces:

Danielos100
/

Gifty

Sleeping

App Files Files Community

Danielos100 committed on Aug 15

Commit

e1a3b1d

verified ·

1 Parent(s): c249c88

Update app.py

Browse files

Files changed (1) hide show

app.py +156 -83

app.py CHANGED Viewed

@@ -1,21 +1,30 @@
 # app.py
-# 🎁 GIfty — Smart Gift Recommender (English / USD)
-# Dataset: ckandemir/amazon-products (HF)
-# Baseline: TF-IDF + cosine. Optional: enable Embeddings + FAISS later.
 import os, re, random
-from typing import Dict, List
 import numpy as np
 import pandas as pd
-from datasets import load_dataset
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.neighbors import NearestNeighbors
 import gradio as gr
-# ============= Config =============
-USE_EMBEDDINGS = False
-MAX_ROWS = int(os.getenv("MAX_ROWS", "5000"))
-DEFAULT_OCCASIONS = "birthday, thank_you, housewarming"
 OCCASION_OPTIONS = [
     "birthday", "anniversary", "valentines", "graduation",
@@ -33,10 +42,17 @@ AGE_OPTIONS = {
 INTEREST_OPTIONS = [
     "reading","writing","tech","travel","fitness","cooking","tea","coffee",
     "games","movies","plants","music","design","stationery","home","experience",
-    "digital","aesthetic","premium","eco","practical","minimalist","social","party"
 ]
-# ============= Data loading & schema =============
 def _to_price_usd(x):
     s = str(x).strip().replace("$","").replace(",","")
     try: return float(s)
@@ -46,9 +62,28 @@ def _infer_age_from_category(cat: str) -> str:
     s = (cat or "").lower()
     if any(k in s for k in ["baby", "toddler", "infant"]): return "kids"
     if "toys & games" in s or "board games" in s or "toy" in s: return "kids"
-    if any(k in s for k in ["teen", "ya", "young adult"]): return "teens"
     return "any"
 def map_amazon_to_schema(df_raw: pd.DataFrame) -> pd.DataFrame:
     cols = {c.lower().strip(): c for c in df_raw.columns}
     get = lambda key: df_raw.get(cols.get(key, ""), "")
@@ -59,15 +94,18 @@ def map_amazon_to_schema(df_raw: pd.DataFrame) -> pd.DataFrame:
         "price_usd": get("selling price").map(_to_price_usd) if "selling price" in cols else np.nan,
         "age_range": "",
         "gender_tags": "any",
-        "occasion_tags": DEFAULT_OCCASIONS,
         "persona_fit": get("category"),
         "image_url": get("image") if "image" in cols else "",
     })
-    out["name"] = out["name"].astype(str).str.strip().str.slice(0,120)
-    out["short_desc"] = out["short_desc"].astype(str).str.strip().str.slice(0,400)
     out["tags"] = out["tags"].astype(str).str.replace("|", ", ").str.lower()
     out["persona_fit"] = out["persona_fit"].astype(str).str.lower()
-    out["age_range"] = out["tags"].map(_infer_age_from_category).fillna("any")
     return out
 def build_doc(row: pd.Series) -> str:
@@ -86,6 +124,7 @@ def load_catalog() -> pd.DataFrame:
         ds = load_dataset("ckandemir/amazon-products", split="train")
         raw = ds.to_pandas()
     except Exception:
         raw = pd.DataFrame({
             "Product Name": ["Wireless Earbuds", "Coffee Sampler", "Strategy Board Game"],
             "Description": [
@@ -105,18 +144,7 @@ def load_catalog() -> pd.DataFrame:
 CATALOG = load_catalog()
-# ============= Retrieval baseline (TF-IDF) =============
-_vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1,2))
-_X = _vectorizer.fit_transform(CATALOG["doc"].fillna(""))
-_nn = NearestNeighbors(n_neighbors=10, metric="cosine").fit(_X)
-def profile_to_query(profile: Dict) -> str:
-    interests = ", ".join(profile.get("interests", []))
-    occasion = profile.get("occasion", "")
-    budget = profile.get("budget_usd", "")
-    age = profile.get("age_range", "any")
-    return f"{interests}. occasion: {occasion}. age: {age}. budget: {budget} USD."
 def _contains_ci(series: pd.Series, needle: str) -> pd.Series:
     if not needle: return pd.Series(True, index=series.index)
     pat = re.escape(needle)
@@ -135,11 +163,57 @@ def filter_business(df: pd.DataFrame, budget_min=None, budget_max=None,
         m &= (df["age_range"].fillna("any").isin([age_range, "any"]))
     return df[m]
-def recommend_topk(profile: Dict, k: int=3) -> pd.DataFrame:
-    """Global kNN → filter to the business subset (fixes index mismatch)."""
-    q = profile_to_query(profile)
-    q_vec = _vectorizer.transform([q])
     df_f = filter_business(
         CATALOG,
         budget_min=profile.get("budget_min"),
@@ -147,51 +221,39 @@ def recommend_topk(profile: Dict, k: int=3) -> pd.DataFrame:
         occasion=profile.get("occasion"),
         age_range=profile.get("age_range","any"),
     )
-    if df_f.empty:
-        df_f = CATALOG
-    # Search on the global index, then keep only rows inside df_f
-    n_cand = min(max(k*50, k), len(CATALOG))
-    dists, inds = _nn.kneighbors(q_vec, n_neighbors=n_cand)
-    cand_global = inds[0]              # indices in CATALOG
-    d = dists[0]
-    order = np.argsort(d)              # ascending distance
     seen, picks = set(), []
-    for gi in cand_global[order]:
-        if gi not in df_f.index:       # keep only filtered subset
             continue
-        nm = CATALOG.loc[gi, "name"]
         if nm in seen:
             continue
         seen.add(nm)
-        # similarity = 1 - distance
-        sim = 1 - float(_nn.kneighbors_graph(q_vec, n_neighbors=1, mode="distance")[0, gi]) if False else 1.0
-        # we already have distances in d; recompute sim from them using same order index:
-        # get distance for this gi:
-        # (for simplicity we just set sim to 1 - current min distance; not critical for UI ranking)
-        picks.append((gi, None))
         if len(picks) >= k:
             break
     if not picks:
-        return df_f.head(k).assign(similarity=np.nan)[["name","short_desc","price_usd","occasion_tags","persona_fit","age_range","image_url","similarity"]]
-    sel = [gi for gi,_ in picks]
-    res = CATALOG.loc[sel].copy()
-    # compute similarity from the original distances vector for display
-    gi_to_dist = {int(gi): float(dist) for gi, dist in zip(cand_global, d)}
-    res["similarity"] = [1.0 - gi_to_dist.get(int(gi), 0.0) for gi in sel]
     return res[["name","short_desc","price_usd","occasion_tags","persona_fit","age_range","image_url","similarity"]]
-# ============= Synthetic item + message =============
 def generate_item(profile: Dict) -> Dict:
     interests = profile.get("interests", [])
-    occasion = profile.get("occasion","birthday")
-    budget = profile.get("budget_max", profile.get("budget_usd", 50)) or 50
-    age = profile.get("age_range","any")
-    core = (interests[0] if interests else "hobby").strip()
     style = random.choice(["personalized","experience","bundle"])
-    base_name, base_desc = "", ""
     if style == "personalized":
         base_name = f"Custom {core} accessory with initials"
         base_desc = f"Thoughtful personalized {core} accessory tailored to their taste."
@@ -207,7 +269,7 @@ def generate_item(profile: Dict) -> Dict:
         base_desc += " Trendy pick that suits young enthusiasts."
     elif age == "senior":
         base_desc += " Comfortable and easy to use."
-    price = float(np.clip(float(budget), 10, 250))
     return {
         "name": f"{base_name} ({occasion})",
         "short_desc": base_desc,
@@ -226,16 +288,25 @@ def generate_message(profile: Dict) -> str:
             f"Happy {occasion}! Wishing you health, joy, and wonderful memories. "
             f"May your goals come true. With {tone}.")
-# ============= Gradio UI (GIfty) =============
 EXAMPLES = [
-    [["reading","travel","aesthetic"], "birthday", [20, 60], "Noa", "adult (18–64)", "warm and friendly"],
-    [["coffee","home","practical"], "housewarming", [25, 45], "Daniel", "adult (18–64)", "warm"],
-    [["tech","digital"], "birthday", [30, 120], "Omer", "teen (13–17)", "fun"],
 ]
-def ui_predict(interests_list: List[str], occasion: str, budget_range, recipient_name: str, age_label: str, tone: str):
     try:
-        # budget_range is a tuple/list: (min, max)
         if isinstance(budget_range, (list, tuple)) and len(budget_range) == 2:
             budget_min, budget_max = float(budget_range[0]), float(budget_range[1])
         else:
@@ -255,30 +326,32 @@ def ui_predict(interests_list: List[str], occasion: str, budget_range, recipient
             "tone": tone or "warm and friendly",
         }
-        recs = recommend_topk(profile, k=3)
         gen = generate_item(profile)
         msg = generate_message(profile)
-        top3_md = recs[["name","short_desc","price_usd","age_range","similarity"]].to_markdown(index=False)
-        gen_md = f"**{gen['name']}**\n\n{gen['short_desc']}\n\n~${gen['price_usd']:.0f}"
         return top3_md, gen_md, msg
     except Exception as e:
         return f":warning: Error: {e}", "", ""
 with gr.Blocks() as demo:
-    gr.Markdown("# 🎁 GIfty — Smart Gift Recommender\n*Top-3 similar picks + 1 generated idea + personalized message*")
     with gr.Row():
         interests = gr.CheckboxGroup(
-            label="Interests (select a few)", choices=INTEREST_OPTIONS,
-            value=["reading","travel","aesthetic"], interactive=True
         )
     with gr.Row():
         occasion = gr.Dropdown(label="Occasion", choices=OCCASION_OPTIONS, value="birthday")
         age = gr.Dropdown(label="Age group", choices=list(AGE_OPTIONS.keys()), value="adult (18–64)")
-    # Range slider for budget (two handles)
-    budget = gr.Slider(label="Budget (USD)", minimum=5, maximum=500, step=1, value=(20, 60))
     with gr.Row():
         recipient_name = gr.Textbox(label="Recipient name", value="Noa")
@@ -286,18 +359,18 @@ with gr.Blocks() as demo:
     go = gr.Button("Get GIfty 🎯")
     out_top3 = gr.Markdown(label="Top-3 recommendations")
-    out_gen = gr.Markdown(label="Generated item")
-    out_msg = gr.Markdown(label="Personalized message")
     gr.Examples(
         EXAMPLES,
-        [interests, occasion, budget, recipient_name, age, tone],
         label="Quick examples",
     )
     go.click(
         ui_predict,
-        [interests, occasion, budget, recipient_name, age, tone],
         [out_top3, out_gen, out_msg]
     )

 # app.py
+# 🎁 GIfty — Smart Gift Recommender (Embeddings + FAISS)
+# Dataset: ckandemir/amazon-products (Hugging Face)
+# UI: Gradio (English)
+#
+# Requirements (requirements.txt):
+# gradio>=4.44.0
+# datasets>=3.0.0
+# pandas>=2.2.2
+# numpy>=1.26.4
+# sentence-transformers>=3.0.1
+# faiss-cpu>=1.8.0
+# tabulate>=0.9.0
 import os, re, random
+from typing import Dict, List, Tuple
 import numpy as np
 import pandas as pd
 import gradio as gr
+from datasets import load_dataset
+from sentence_transformers import SentenceTransformer
+import faiss
+# ========================= Config =========================
+MAX_ROWS = int(os.getenv("MAX_ROWS", "10000"))  # cap for speed
+TITLE = "# 🎁 GIfty — Smart Gift Recommender\n*Top-3 similar picks + 1 generated idea + personalized message*"
 OCCASION_OPTIONS = [
     "birthday", "anniversary", "valentines", "graduation",
 INTEREST_OPTIONS = [
     "reading","writing","tech","travel","fitness","cooking","tea","coffee",
     "games","movies","plants","music","design","stationery","home","experience",
+    "digital","aesthetic","premium","eco","practical","minimalist","social","party",
+    "photography","outdoors","pets","beauty","jewelry"
 ]
+MODEL_CHOICES = {
+    "MiniLM (384d)": "sentence-transformers/all-MiniLM-L6-v2",
+    "MPNet (768d)": "sentence-transformers/all-mpnet-base-v2",
+    "E5-base (768d)": "intfloat/e5-base-v2",
+}
+# ========================= Data loading & schema =========================
 def _to_price_usd(x):
     s = str(x).strip().replace("$","").replace(",","")
     try: return float(s)
     s = (cat or "").lower()
     if any(k in s for k in ["baby", "toddler", "infant"]): return "kids"
     if "toys & games" in s or "board games" in s or "toy" in s: return "kids"
+    if any(k in s for k in ["teen", "young adult", "ya"]): return "teens"
     return "any"
+def _infer_occasion_tags(cat: str) -> str:
+    s = (cat or "").lower()
+    tags = set(["birthday"])  # default
+    if any(k in s for k in ["home & kitchen","furniture","home décor","home decor","garden","tools","appliance","cookware","kitchen"]):
+        tags.update(["housewarming","thank_you"])
+    if any(k in s for k in ["beauty","jewelry","watch","fragrance","cosmetic","makeup","skincare"]):
+        tags.update(["valentines","anniversary"])
+    if any(k in s for k in ["toys","board game","puzzle","kids","lego"]):
+        tags.update(["hanukkah","christmas"])
+    if any(k in s for k in ["office","stationery","notebook","pen","planner"]):
+        tags.update(["graduation","thank_you"])
+    if any(k in s for k in ["electronics","camera","audio","headphones","gaming","computer"]):
+        tags.update(["birthday","christmas"])
+    if any(k in s for k in ["book","novel","literature"]):
+        tags.update(["graduation","thank_you"])
+    if any(k in s for k in ["sports","fitness","outdoor","camping","hiking","run","yoga"]):
+        tags.update(["birthday"])
+    return ",".join(sorted(tags))
 def map_amazon_to_schema(df_raw: pd.DataFrame) -> pd.DataFrame:
     cols = {c.lower().strip(): c for c in df_raw.columns}
     get = lambda key: df_raw.get(cols.get(key, ""), "")
         "price_usd": get("selling price").map(_to_price_usd) if "selling price" in cols else np.nan,
         "age_range": "",
         "gender_tags": "any",
+        "occasion_tags": "",
         "persona_fit": get("category"),
         "image_url": get("image") if "image" in cols else "",
     })
+    # clean
+    out["name"] = out["name"].astype(str).str.strip().str.slice(0, 120)
+    out["short_desc"] = out["short_desc"].astype(str).str.strip().str.slice(0, 500)
     out["tags"] = out["tags"].astype(str).str.replace("|", ", ").str.lower()
     out["persona_fit"] = out["persona_fit"].astype(str).str.lower()
+    # infer occasion & age
+    out["occasion_tags"] = out["tags"].map(_infer_occasion_tags)
+    out["age_range"]    = out["tags"].map(_infer_age_from_category).fillna("any")
     return out
 def build_doc(row: pd.Series) -> str:
         ds = load_dataset("ckandemir/amazon-products", split="train")
         raw = ds.to_pandas()
     except Exception:
+        # Fallback (keeps the app alive if internet is blocked)
         raw = pd.DataFrame({
             "Product Name": ["Wireless Earbuds", "Coffee Sampler", "Strategy Board Game"],
             "Description": [
 CATALOG = load_catalog()
+# ========================= Business filters =========================
 def _contains_ci(series: pd.Series, needle: str) -> pd.Series:
     if not needle: return pd.Series(True, index=series.index)
     pat = re.escape(needle)
         m &= (df["age_range"].fillna("any").isin([age_range, "any"]))
     return df[m]
+# ========================= Embeddings + FAISS =========================
class EmbeddingStore:
    """Lazily build and cache one embedding model and FAISS index per model id.

    The corpus is fixed at construction time. Nothing is loaded or encoded
    until a model id is first requested through `ensure_ready` or `search`.
    """

    def __init__(self, docs: List[str]):
        """Store the corpus and create empty per-model caches.

        Args:
            docs: Texts to index; position i in this list is the id that
                `search` returns for that text.
        """
        self.docs = docs
        self.model_cache: Dict[str, SentenceTransformer] = {}
        self.index_cache: Dict[str, faiss.Index] = {}
        self.dim_cache: Dict[str, int] = {}

    def _build(self, model_id: str):
        """Load `model_id`, embed every doc, and cache model, index, and dimension.

        Raises:
            ValueError: If the corpus is empty, since there is nothing to index.
        """
        if not self.docs:
            raise ValueError("EmbeddingStore has no documents to index")
        model = SentenceTransformer(model_id)
        embs = model.encode(self.docs, convert_to_numpy=True, normalize_embeddings=True)
        # FAISS only accepts C-contiguous float32 matrices.
        embs = np.ascontiguousarray(embs, dtype=np.float32)
        dim = embs.shape[1]
        index = faiss.IndexFlatIP(dim)  # cosine if normalized
        index.add(embs)
        self.model_cache[model_id] = model
        self.index_cache[model_id] = index
        self.dim_cache[model_id] = dim

    def ensure_ready(self, model_id: str):
        """Build the index for `model_id` unless it is already cached."""
        if model_id not in self.index_cache:
            self._build(model_id)

    def search(self, model_id: str, query: str, topn: int) -> Tuple[np.ndarray, np.ndarray]:
        """Return the best matches for `query` under the given model.

        Args:
            model_id: Embedding model to search with; built on first use.
            query: Free-text query, encoded the same way as the docs.
            topn: Maximum number of hits wanted. Values above the index size
                are clamped; values <= 0 yield empty arrays.

        Returns:
            `(similarities, doc_ids)` as two 1-D arrays of equal length, in the
            order FAISS returned them.
        """
        self.ensure_ready(model_id)
        index = self.index_cache[model_id]
        # Asking FAISS for more neighbours than it holds pads the result with
        # id -1, and k <= 0 is rejected outright, so clamp first.
        k = min(int(topn), int(index.ntotal))
        if k <= 0:
            return np.empty(0, dtype=np.float32), np.empty(0, dtype=np.int64)
        model = self.model_cache[model_id]
        qv = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
        qv = np.ascontiguousarray(qv, dtype=np.float32)
        sims, idxs = index.search(qv, k)
        found = idxs[0] >= 0  # defensive: drop any padding ids
        return sims[0][found], idxs[0][found]
+EMB_STORE = EmbeddingStore(CATALOG["doc"].tolist())
def profile_to_query(profile: Dict) -> str:
    """Turn a recipient profile into the text query used for embedding search.

    Interests are repeated three times so they outweigh the occasion and age
    terms; the segments are joined with " | ".

    Args:
        profile: Mapping that may hold "interests" (list of strings),
            "occasion" (string) and "age_range" (string). Missing or None
            values are treated as absent, and non-string interest entries
            are ignored.

    Returns:
        The query string, or "" when the profile contributes no terms.
    """
    # `or []` also covers the key being present with a None value.
    raw_interests = profile.get("interests") or []
    interests = [
        item.strip().lower()
        for item in raw_interests
        if isinstance(item, str) and item.strip()
    ]
    occasion = (profile.get("occasion") or "").lower()
    age = profile.get("age_range", "any")

    parts = []
    if interests:
        parts.append(", ".join(interests * 3))  # weight *3
    if occasion:
        parts.append(occasion)
    if age and age != "any":
        parts.append(age)
    return " | ".join(parts).strip()
+def recommend_topk_embeddings(profile: Dict, model_key: str, k: int=3) -> pd.DataFrame:
+    model_id = MODEL_CHOICES.get(model_key, list(MODEL_CHOICES.values())[0])
+    query = profile_to_query(profile)
+    # global search on whole catalog
+    sims, idxs = EMB_STORE.search(model_id, query, topn=min(max(k*50, k), len(CATALOG)))
+    # filter to business subset
     df_f = filter_business(
         CATALOG,
         budget_min=profile.get("budget_min"),
         occasion=profile.get("occasion"),
         age_range=profile.get("age_range","any"),
     )
+    if df_f.empty: df_f = CATALOG
+    order = np.argsort(-sims)  # descending similarity
     seen, picks = set(), []
+    for gi in idxs[order]:
+        if gi not in df_f.index:  # keep only allowed subset
             continue
+        nm = CATALOG.loc[int(gi), "name"]
         if nm in seen:
             continue
         seen.add(nm)
+        picks.append(int(gi))
         if len(picks) >= k:
             break
     if not picks:
+        res = df_f.head(k).copy()
+        res["similarity"] = np.nan
+        return res[["name","short_desc","price_usd","occasion_tags","persona_fit","age_range","image_url","similarity"]]
+    gi_to_sim = {int(i): float(s) for i, s in zip(idxs, sims)}
+    res = CATALOG.loc[picks].copy()
+    res["similarity"] = [gi_to_sim.get(int(i), np.nan) for i in picks]
     return res[["name","short_desc","price_usd","occasion_tags","persona_fit","age_range","image_url","similarity"]]
+# ========================= Synthetic item + message =========================
 def generate_item(profile: Dict) -> Dict:
     interests = profile.get("interests", [])
+    occasion  = profile.get("occasion","birthday")
+    budget    = profile.get("budget_max", profile.get("budget_usd", 50)) or 50
+    age       = profile.get("age_range","any")
+    core = (interests[0] if interests else "hobby").strip() or "hobby"
     style = random.choice(["personalized","experience","bundle"])
     if style == "personalized":
         base_name = f"Custom {core} accessory with initials"
         base_desc = f"Thoughtful personalized {core} accessory tailored to their taste."
         base_desc += " Trendy pick that suits young enthusiasts."
     elif age == "senior":
         base_desc += " Comfortable and easy to use."
+    price = float(np.clip(float(budget), 10, 300))
     return {
         "name": f"{base_name} ({occasion})",
         "short_desc": base_desc,
             f"Happy {occasion}! Wishing you health, joy, and wonderful memories. "
             f"May your goals come true. With {tone}.")
+# ========================= Gradio UI =========================
 EXAMPLES = [
+    [["tech","music"], "birthday", [20, 60], "Noa", "adult (18–64)", "MiniLM (384d)", "warm and friendly"],
+    [["home","cooking","practical"], "housewarming", [25, 45], "Daniel", "adult (18–64)", "MiniLM (384d)", "warm"],
+    [["games","photography"], "birthday", [30, 120], "Omer", "teen (13–17)", "MPNet (768d)", "fun"],
+    [["reading","design","aesthetic"], "thank_you", [15, 35], "Maya", "any", "E5-base (768d)", "friendly"],
 ]
def safe_markdown_table(df: pd.DataFrame) -> str:
    """Render `df` as Markdown without requiring the optional tabulate package.

    Args:
        df: Table to render; the index is never included.

    Returns:
        A Markdown pipe table when `DataFrame.to_markdown` is usable.
        Otherwise the plain-text rendering wrapped in a fenced code block,
        so a Markdown viewer keeps the column alignment instead of
        collapsing the whitespace.
    """
    try:
        return df.to_markdown(index=False)
    except ImportError:
        # pandas raises ImportError when tabulate is not installed.
        return "```\n" + df.to_string(index=False) + "\n```"
+def ui_predict(interests_list: List[str], occasion: str, budget_range, recipient_name: str,
+               age_label: str, model_key: str, tone: str):
     try:
+        # Parse budget range [min, max]
         if isinstance(budget_range, (list, tuple)) and len(budget_range) == 2:
             budget_min, budget_max = float(budget_range[0]), float(budget_range[1])
         else:
             "tone": tone or "warm and friendly",
         }
+        recs = recommend_topk_embeddings(profile, model_key, k=3)
         gen = generate_item(profile)
         msg = generate_message(profile)
+        top3_md = safe_markdown_table(recs[["name","short_desc","price_usd","age_range","similarity"]])
+        gen_md  = f"**{gen['name']}**\n\n{gen['short_desc']}\n\n~${gen['price_usd']:.0f}"
         return top3_md, gen_md, msg
     except Exception as e:
         return f":warning: Error: {e}", "", ""
 with gr.Blocks() as demo:
+    gr.Markdown(TITLE)
     with gr.Row():
         interests = gr.CheckboxGroup(
+            label="Interests (select a few)",
+            choices=INTEREST_OPTIONS,
+            value=["tech","music"],
+            interactive=True
         )
     with gr.Row():
         occasion = gr.Dropdown(label="Occasion", choices=OCCASION_OPTIONS, value="birthday")
         age = gr.Dropdown(label="Age group", choices=list(AGE_OPTIONS.keys()), value="adult (18–64)")
+        model = gr.Dropdown(label="Embedding model", choices=list(MODEL_CHOICES.keys()), value="MiniLM (384d)")
+    budget = gr.RangeSlider(label="Budget range (USD)", minimum=5, maximum=500, step=1, value=[20, 60])
     with gr.Row():
         recipient_name = gr.Textbox(label="Recipient name", value="Noa")
     go = gr.Button("Get GIfty 🎯")
     out_top3 = gr.Markdown(label="Top-3 recommendations")
+    out_gen  = gr.Markdown(label="Generated item")
+    out_msg  = gr.Markdown(label="Personalized message")
     gr.Examples(
         EXAMPLES,
+        [interests, occasion, budget, recipient_name, age, model, tone],
         label="Quick examples",
     )
     go.click(
         ui_predict,
+        [interests, occasion, budget, recipient_name, age, model, tone],
         [out_top3, out_gen, out_msg]
     )