Update app.py

app.py (CHANGED)
@@ -1,693 +1,280 @@
-"""
-HF Space: Normalization + Twitter Sentiment Workbench
-Now with built-in datasets:
-  • Sentiment140 (HF datasets: sentiment140)
-  • TweetEval (sentiment) (HF datasets: tweet_eval / sentiment)
-
-Tabs:
-  • Single Text – step-by-step normalization + sentiment bar
-  • Batch Tweets (CSV) – upload your own file
-  • Datasets – pull Sentiment140/TweetEval, sample/filter, analyze, and benchmark
-
-Models:
-  • VADER (fast baseline)
-  • Twitter-RoBERTa (cardiffnlp/twitter-roberta-base-sentiment-latest)
-
-Run locally:
-  pip install -r requirements.txt
-  python app.py
-"""
-
-import os
-import re
-import json
-from typing import List, Tuple, Optional, Dict
-from collections import Counter, defaultdict
-
 import gradio as gr
 import pandas as pd
 import numpy as np
-    "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng",
-    "vader_lexicon"
-]:
     try:
     except Exception:
-DATASETS_AVAILABLE = True
-try:
-    from datasets import load_dataset
-except Exception:
-    DATASETS_AVAILABLE = False
-
-# =========================
-# Core text normalization
-# =========================
-_punct_re = re.compile(r"[^\w\s]", flags=re.UNICODE)
-_tkn = TweetTokenizer()
-
-def remove_non_ascii(words: List[str]) -> List[str]:
-    out = []
-    for w in words:
-        ascii_w = "".join(ch for ch in w if ord(ch) < 128)
-        if ascii_w:
-            out.append(ascii_w)
-    return out
-
-def to_lowercase(words: List[str]) -> List[str]:
-    return [w.lower() for w in words]
-
-def remove_punctuation(words: List[str]) -> List[str]:
-    out = []
-    for w in words:
-        stripped = _punct_re.sub("", w)
-        if stripped:
-            out.append(stripped)
-    return out
-
-def _build_stopword_set() -> set:
-    base = set(stopwords.words("english"))
-    base |= {"rt","amp","https","http","t","co","u","s","us"} # twitter-ish noise
-    stripped_variants = {_punct_re.sub("", w) for w in base}
-    return base | stripped_variants
-
-_STOPWORDS = _build_stopword_set()
-_lemmatizer = WordNetLemmatizer()
-
-def _to_wordnet_pos(treebank_tag: str):
-    if not treebank_tag:
-        return wn.NOUN
-    t = treebank_tag[0].upper()
-    if t == "J": return wn.ADJ
-    if t == "V": return wn.VERB
-    if t == "N": return wn.NOUN
-    if t == "R": return wn.ADV
-    return wn.NOUN
-
-def lemmatize_list(words: List[str]) -> List[str]:
-    try:
-        tagged = nltk.pos_tag(words)
-    except LookupError:
-        tagged = [(w, "N") for w in words]
-    return [_lemmatizer.lemmatize(w, _to_wordnet_pos(tag)) for w, tag in tagged]
-
-def tokenize(text: str) -> List[str]:
-    return _tkn.tokenize(text)
-
-def normalize(text: str) -> str:
-    """Full preprocessing pipeline (your original)."""
-    words = tokenize(text)
-    words = remove_non_ascii(words)
-    words = to_lowercase(words)
-    words = remove_punctuation(words)
-    words = [w for w in words if w not in _STOPWORDS]
-    words = lemmatize_list(words)
-    return " ".join(words)
-
-# =========================
-# Twitter-aware cleaning
-# =========================
-url_re = re.compile(r"https?://\S+|www\.\S+")
-mention_re = re.compile(r"@\w+")
-hashtag_re = re.compile(r"#(\w+)")
-rt_re = re.compile(r"\brt\b", re.IGNORECASE)
-amp_re = re.compile(r"\bamp\b", re.IGNORECASE)
-
-def twitter_clean(text: str) -> str:
-    if not text: return ""
-    s = url_re.sub("", text)
-    s = mention_re.sub("", s)
-    s = hashtag_re.sub(lambda m: m.group(1), s) # keep hashtag word
-    s = rt_re.sub("", s)
-    s = amp_re.sub("", s)
-    s = s.replace("U.S.", "US").replace("u.s.", "us")
-    return re.sub(r"\s+", " ", s).strip()
-
-# =========================
-# Sentiment backends
-# =========================
-_sia = SentimentIntensityAnalyzer()
-
-ROBERTA_ID = "cardiffnlp/twitter-roberta-base-sentiment-latest"
-_roberta_tok = None
-_roberta_model = None
-
-def _load_roberta():
-    global _roberta_tok, _roberta_model
-    if not TRANSFORMERS_AVAILABLE:
-        return False
-    if _roberta_model is None:
-        _roberta_tok = AutoTokenizer.from_pretrained(ROBERTA_ID)
-        _roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_ID)
-        _roberta_model.eval()
-    return True
-
-def vader_scores(text: str) -> Dict[str, float]:
-    s = twitter_clean(text)
-    sc = _sia.polarity_scores(s)
-    return sc # keys: neg, neu, pos, compound
-
-def roberta_scores(text: str) -> Optional[Dict[str, float]]:
-    if not _load_roberta():
-        return None
-    s = twitter_clean(text)
-    inputs = _roberta_tok(s, return_tensors="pt", truncation=True, max_length=256)
-    with torch.no_grad():
-        logits = _roberta_model(**inputs).logits
-    probs = F.softmax(logits, dim=1).squeeze().cpu().tolist()
-    # Map to VADER-like schema; define compound = pos - neg
-    return {"neg": float(probs[0]), "neu": float(probs[1]), "pos": float(probs[2]), "compound": float(probs[2] - probs[0])}
-
-def score_text(text: str, model_name: str) -> Dict[str, float]:
-    if model_name == "Twitter-RoBERTa":
-        sc = roberta_scores(text)
-        if sc is not None:
-            return sc
-    return vader_scores(text)
-
-def label_from_compound(c: float, pos_thr: float = 0.05, neg_thr: float = -0.05) -> str:
-    if c >= pos_thr: return "positive"
-    if c <= neg_thr: return "negative"
-    return "neutral"
-
-# =========================
-# Visual helpers (matplotlib; default colors only)
-# =========================
-def plot_sentiment_bar(scores: Dict[str, float]):
-    fig = plt.figure(figsize=(4.8, 3.0))
-    keys = ["neg","neu","pos","compound"]
-    vals_adj = [scores["neg"], scores["neu"], scores["pos"], (scores["compound"] + 1) / 2]
-    plt.bar(keys, vals_adj)
-    plt.title("Sentiment Scores")
-    plt.ylim(0, 1)
-    return fig
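For reference, a minimal usage sketch of how the removed helpers above composed. It assumes the old app.py is importable and its NLTK data is installed; the import line, the sample tweet, and the URL in it are illustrative only, and the printed values will vary.

# Hypothetical driver for the removed pipeline; not part of the Space itself.
from app import twitter_clean, normalize, vader_scores, label_from_compound

tweet = "RT @user U.S. tariffs on steel are back https://t.co/xyz #tariffs"
print(twitter_clean(tweet))                     # URLs, @mentions and "rt" stripped; hashtag word kept
print(normalize(tweet))                         # tokenize -> ASCII -> lowercase -> strip punctuation -> stopwords -> lemmatize
scores = vader_scores(tweet)                    # dict with keys neg, neu, pos, compound
print(label_from_compound(scores["compound"]))  # "positive" / "neutral" / "negative" with the default ±0.05 thresholds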
-def plot_counts(labels: List[str], title: str):
-    fig = plt.figure(figsize=(6,3))
-    series = pd.Series(labels).value_counts().reindex(["negative","neutral","positive"]).fillna(0)
-    plt.bar(series.index.astype(str), series.values.astype(int))
-    plt.title(title)
-    plt.xlabel("label")
-    plt.ylabel("count")
-    plt.tight_layout()
-    return fig
     return fig
-from wordcloud import WordCloud
-from PIL import Image
-
-def wordcloud_from_tokens(tokens: List[str]):
-    text = " ".join(tokens)
-    if not text.strip():
-        return Image.new("RGB", (800, 400), color=(255,255,255))
-    wc = WordCloud(width=800, height=400, background_color="white")
-    return wc.generate(text).to_image()
-
-# =========================
-# Token analytics
-# =========================
-def tokens_from_texts(texts: List[str]) -> List[str]:
-    all_toks = []
-    for t in texts:
-        s = twitter_clean(t)
-        toks = tokenize(s)
-        toks = [w.lower() for w in toks]
-        toks = [ _punct_re.sub("", w) for w in toks ]
-        toks = [w for w in toks if w and (w not in _STOPWORDS)]
-        toks = [ _lemmatizer.lemmatize(w) for w in toks ]
-        all_toks.extend(toks)
-    return all_toks
-
-def bigrams(tokens: List[str]):
-    return list(zip(tokens, tokens[1:]))
-
-# =========================
-# Aspect-based (simple window)
-# =========================
-DEFAULT_ASPECTS = ["tariff","jobs","prices","china","farmers","john", "deere"]
-
-def aspect_sentiment(texts: List[str], aspects: List[str], model_name: str, window: int = 6):
-    out = {a.lower(): [] for a in aspects}
-    for t in texts:
-        clean = twitter_clean(t)
-        toks = clean.split()
-        for i, tok in enumerate(toks):
-            for a in aspects:
-                key = a.lower().split()[0]
-                if tok.lower() == key:
-                    lo, hi = max(0, i-window), min(len(toks), i+window+1)
-                    chunk = " ".join(toks[lo:hi])
-                    sc = score_text(chunk, model_name)["compound"]
-                    out[a.lower()].append(sc)
-    rows = []
-    for a, vals in out.items():
-        rows.append({
-            "aspect": a,
-            "n": len(vals),
-            "mean_compound": float(np.mean(vals)) if vals else 0.0
-        })
-    df = pd.DataFrame(rows).sort_values(["n","mean_compound"], ascending=[False, False])
-    return df
-
-# =========================
-# Topic clustering (TF-IDF + k-means)
-# =========================
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.cluster import KMeans
-from sklearn.metrics import classification_report, confusion_matrix
-
-def cluster_topics(texts: List[str], n_clusters: int, model_name: str):
-    docs = [twitter_clean(t) for t in texts]
-    base_docs = [d for d in docs if len(d.split()) >= 3]
-    if len(base_docs) < max(5, n_clusters):
-        return pd.DataFrame(columns=["cluster","size","mean_compound","top_terms"]), None
-    vec = TfidfVectorizer(max_features=4000, ngram_range=(1,2), stop_words="english")
-    X = vec.fit_transform(base_docs)
-    km = KMeans(n_clusters=n_clusters, n_init="auto", random_state=0)
-    labels = km.fit_predict(X)
-    terms = np.array(vec.get_feature_names_out())
-    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
-    top_terms = {i: ", ".join(terms[order_centroids[i, :8]]) for i in range(n_clusters)}
-    comp = [score_text(d, model_name)["compound"] for d in base_docs]
-    df = pd.DataFrame({"cluster": labels, "doc": base_docs, "compound": comp})
-    agg = df.groupby("cluster")["compound"].agg(["size","mean"]).reset_index().rename(columns={"mean":"mean_compound"})
-    agg["top_terms"] = agg["cluster"].map(top_terms)
-    agg = agg.sort_values("size", ascending=False)
-    fig = plt.figure(figsize=(6,3))
-    plt.bar(agg["cluster"].astype(str), agg["mean_compound"])
-    plt.title("Cluster mean sentiment (compound)")
-    plt.xlabel("cluster")
-    plt.ylabel("mean compound")
-    plt.tight_layout()
-    return agg, fig
-
-# =========================
-# SINGLE TEXT: step-by-step
-# =========================
-def normalize_with_steps(text: str, model_name: str):
-    if not text or not text.strip():
-        df = pd.DataFrame([{"Step":"No input","Tokens":"[]","As Text":""}])
-        return df, "", pd.DataFrame([{"neg":0,"neu":0,"pos":0,"compound":0}]), None
-    steps = []
-    tokens = tokenize(text); steps.append(("Tokenize", tokens, " ".join(tokens)))
-    tokens = remove_non_ascii(tokens); steps.append(("Remove non-ASCII", tokens, " ".join(tokens)))
-    tokens = to_lowercase(tokens); steps.append(("Lowercase", tokens, " ".join(tokens)))
-    tokens = remove_punctuation(tokens); steps.append(("Remove punctuation", tokens, " ".join(tokens)))
-    tokens = [w for w in tokens if w not in _STOPWORDS]; steps.append(("Remove stopwords", tokens, " ".join(tokens)))
-    tokens = lemmatize_list(tokens); steps.append(("Lemmatize", tokens, " ".join(tokens)))
-    final_text = " ".join(tokens); steps.append(("Final join", tokens, final_text))
-    rows = [{"Step":n, "Tokens":json.dumps(t, ensure_ascii=False), "As Text":s} for n,t,s in steps]
-    steps_df = pd.DataFrame(rows, columns=["Step","Tokens","As Text"])
-
-    scores = score_text(text, model_name)
-    sent_df = pd.DataFrame([scores])
-    fig = plot_sentiment_bar(scores)
-    return steps_df, final_text, sent_df, fig
-
-# =========================
-# ANALYSIS CORE (shared by CSV & datasets)
-# =========================
-def detect_text_column(df: pd.DataFrame) -> str:
-    candidates = ["text","tweet","full_text","content","body"]
-    for c in candidates:
-        if c in df.columns: return c
-    for c in df.columns:
-        if df[c].dtype == object:
-            return c
-    return df.columns[0]
-
-def analyze_df(df_in: pd.DataFrame, model_name: str, pos_thr: float, neg_thr: float,
-               dedup: bool, min_len: int, top_n: int, n_clusters: int,
-               aspects_str: str, gold_series: Optional[pd.Series] = None):
-    df = df_in.copy()
-    text_col = detect_text_column(df)
-    df["raw"] = df[text_col].astype(str)
-
-    if dedup:
-        df = df.drop_duplicates(subset=["raw"])
-    df = df[df["raw"].str.split().str.len().fillna(0) >= int(min_len)].copy()
-
-    # Score
-    scs = df["raw"].apply(lambda t: score_text(t, model_name))
-    sent_df = pd.DataFrame(list(scs))
-    df = pd.concat([df.reset_index(drop=True), sent_df.reset_index(drop=True)], axis=1)
-    df["label"] = df["compound"].apply(lambda c: label_from_compound(c, pos_thr, neg_thr))
-
-    # Summary
-    n = len(df)
-    share_pos = (df["label"]=="positive").mean() if n else 0
-    share_neu = (df["label"]=="neutral").mean() if n else 0
-    share_neg = (df["label"]=="negative").mean() if n else 0
-    extremes = (df["compound"].abs() >= 0.6).mean() if n else 0
-    summary = pd.DataFrame([{
-        "n_tweets": n,
-        "share_positive": round(share_pos,3),
-        "share_neutral": round(share_neu,3),
-        "share_negative": round(share_neg,3),
-        "share_extremes_|compound|>=0.6": round(extremes,3),
-        "compound_mean": round(df["compound"].mean() if n else 0, 4),
-        "compound_std": round(df["compound"].std(ddof=1) if n>1 else 0, 4),
-    }])

-    for t, comp in zip(df["raw"], df["compound"]):
-        tags = re.findall(r"#(\w+)", t)
-        for tag in tags:
-            all_rows.append((tag.lower(), comp))
-    tag_map = defaultdict(list)
-    for tag, sc in all_rows:
-        tag_map[tag].append(sc)
-    tag_stats = sorted([(k, len(v), float(np.mean(v))) for k, v in tag_map.items()],
-                       key=lambda x: x[1], reverse=True)[:top_n]
-    tag_df = pd.DataFrame(tag_stats, columns=["hashtag","count","mean_compound"])
-    tag_fig = plot_top_bar([(h, c) for h,c,_ in tag_stats], "Top hashtags (by count)", rotate=45)
-
-    # Aspects
-    aspects = [a.strip() for a in (aspects_str or "").split(",") if a.strip()] or DEFAULT_ASPECTS
-    asp_df = aspect_sentiment(df["raw"].tolist(), aspects, model_name)
-
-    # Clusters
-    cluster_tbl, cluster_fig = cluster_topics(df["raw"].tolist(), int(n_clusters), model_name)
-
-    # Evaluation vs gold labels (if provided)
-    report_df = pd.DataFrame()
-    cm_fig = None
-    if gold_series is not None and len(gold_series) == len(df):
-        y_true = gold_series.tolist()
-        # Drop rows with unknown gold
-        mask = pd.Series([y in {"negative","neutral","positive"} for y in y_true])
-        y_true = pd.Series(y_true)[mask].tolist()
-        y_pred = df["label"][mask.values].tolist()
-        if y_true:
-            report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
-            report_df = pd.DataFrame(report).transpose().reset_index().rename(columns={"index":"class"})
-            labels_order = ["negative","neutral","positive"]
-            cm = confusion_matrix(y_true, y_pred, labels=labels_order)
-            fig = plt.figure(figsize=(4.5,3.8))
-            plt.imshow(cm, interpolation="nearest")
-            plt.title("Confusion matrix")
-            plt.xticks(range(len(labels_order)), labels_order, rotation=45, ha="right")
-            plt.yticks(range(len(labels_order)), labels_order)
-            for i in range(cm.shape[0]):
-                for j in range(cm.shape[1]):
-                    plt.text(j, i, str(cm[i, j]), ha="center", va="center")
-            plt.tight_layout()
-            cm_fig = fig
-
-    # Output file
-    out_csv = "tweets_with_sentiment.csv"
-    df.to_csv(out_csv, index=False)
-
-    return (
-        summary,
-        hist_fig, count_fig,
-        words_fig, bigrams_fig, wc_img,
-        tag_df, tag_fig,
-        asp_df,
-        cluster_tbl, cluster_fig,
-        out_csv,
-        report_df, cm_fig
-    )
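The aspect table above hinges on a simple windowed slice around each aspect keyword. A standalone sketch of just that slicing step, using a toy sentence instead of VADER/RoBERTa scoring (the helper name is hypothetical):

from typing import List

def aspect_windows(text: str, aspect: str, window: int = 6) -> List[str]:
    # Collect the window-token context around each occurrence of the aspect word,
    # mirroring the slicing inside aspect_sentiment() above.
    toks = text.split()
    chunks = []
    for i, tok in enumerate(toks):
        if tok.lower() == aspect.lower():
            lo, hi = max(0, i - window), min(len(toks), i + window + 1)
            chunks.append(" ".join(toks[lo:hi]))
    return chunks

print(aspect_windows("farmers say the new tariff hurts soybean exports badly", "tariff", window=3))
# ['say the new tariff hurts soybean exports']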
-#
-#
-#
-
         return (
-            None,
-            pd.DataFrame(
         )
-    df = pd.read_csv(file.name)
-    return analyze_df(df, model_name, pos_thr, neg_thr, dedup, min_len, top_n, n_clusters, aspects_str)
-
-# =========================
-# DATASETS entry point
-# =========================
-def load_hf_dataset(dataset_name: str, split: str, sample_n: int, keyword: str, random_sample: bool):
-    if not DATASETS_AVAILABLE:
-        raise RuntimeError("The 'datasets' library is not available in this Space.")
-    if dataset_name == "Sentiment140":
-        # Split choices on HF are often train only; accept 'train' fallback
-        ds = load_dataset("sentiment140", split=split or "train")
-        df = ds.to_pandas()
-        text_col = "text" if "text" in df.columns else detect_text_column(df)
-        gold = None
-        # sentiment140 labels: 0=neg, 4=pos (no neutral)
-        if "sentiment" in df.columns:
-            gold_map = {0: "negative", 4: "positive"}
-            gold = df["sentiment"].map(gold_map).fillna("neutral")
-        df = df.rename(columns={text_col: "text"})[["text"]].copy()
-    elif dataset_name == "TweetEval (sentiment)":
-        ds = load_dataset("tweet_eval", "sentiment", split=split or "test")
-        df = ds.to_pandas()
-        # labels: 0=neg, 1=neu, 2=pos
-        label_map = {0:"negative", 1:"neutral", 2:"positive"}
-        gold = df["label"].map(label_map)
-        df = df.rename(columns={"text": "text"})[["text"]].copy()
-    else:
-        raise ValueError("Unknown dataset.")
-    if keyword:
-        df = df[df["text"].str.contains(keyword, case=False, na=False)]
-        if gold is not None:
-            gold = gold.loc[df.index]
-    if sample_n and sample_n > 0 and sample_n < len(df):
-        if random_sample:
-            df = df.sample(n=sample_n, random_state=0)
-        else:
-            df = df.head(sample_n)
-        if gold is not None:
-            gold = gold.loc[df.index]
-    gold = gold.reset_index(drop=True) if gold is not None else None
-    return df.reset_index(drop=True), gold
-
-def analyze_dataset(dataset_name: str, split: str, sample_n: int, keyword: str, random_sample: bool,
-                    model_name: str, pos_thr: float, neg_thr: float,
-                    dedup: bool, min_len: int, top_n: int, n_clusters: int,
-                    aspects_str: str):
-    try:
-        df, gold = load_hf_dataset(dataset_name, split, sample_n, keyword, random_sample)
-    except Exception as e:
-        msg = pd.DataFrame([{"error": str(e)}])
-        return (msg, None, None, None, None, None, None, None, None, None, None,
-                None, pd.DataFrame(), None)
-    results = analyze_df(df, model_name, pos_thr, neg_thr, dedup, min_len, top_n, n_clusters, aspects_str, gold_series=gold)
-    # Prepend a small preview table of the dataset
-    preview = df.head(10)
-    return (preview, *results)
-
-# =========================
-# UI
-# =========================
-EXAMPLES = [
-    "Cats, DOGS!!! aren't running; they're sleeping.",
-    "U.S. tariffs on steel & aluminum — what's next?",
-    "This movie was absolutely amazing—loved every scene!",
-    "Service was terrible; I’m never coming back."
-]

         )

-        steps_out = gr.Dataframe(headers=["Step","Tokens","As Text"], label="Step-by-step", interactive=False)
-        final_out = gr.Textbox(label="Final normalized output", interactive=False)
-        sent_df = gr.Dataframe(label="Sentiment scores", interactive=False)
-        sent_plot = gr.Plot(label="Sentiment (bar plot)")
-        run_btn.click(fn=normalize_with_steps, inputs=[inp, model_dd],
-                      outputs=[steps_out, final_out, sent_df, sent_plot])
-
-    # ----- Batch CSV -----
-    with gr.Tab("Batch Tweets (CSV)"):
-        gr.Markdown("Upload a CSV with a tweet text column (auto-detected).")
-        with gr.Row():
-            file_up = gr.File(file_types=[".csv"], label="Upload CSV")
-            model_csv = gr.Dropdown(["VADER","Twitter-RoBERTa"], value="VADER", label="Model")
-            pos_thr = gr.Slider(0.0, 0.5, value=0.05, step=0.01, label="Positive threshold (compound ≥)")
-            neg_thr = gr.Slider(-0.5, 0.0, value=-0.05, step=0.01, label="Negative threshold (compound ≤)")
-        with gr.Row():
-            dedup = gr.Checkbox(value=True, label="Drop duplicate tweets")
-            min_len = gr.Slider(0, 10, value=3, step=1, label="Min token length (filter)")
-            top_n = gr.Slider(5, 30, value=15, step=1, label="Top-N for words/bigrams/hashtags")
-            n_clusters = gr.Slider(2, 8, value=4, step=1, label="Topic clusters (k-means)")
-        aspects = gr.Textbox(value="tariff, jobs, prices, china, farmers, john deere",
-                             label="Aspects (comma-separated)")
-        go = gr.Button("Analyze CSV", variant="primary")
-
-        summary_table = gr.Dataframe(label="Summary", interactive=False)
-        hist_fig = gr.Plot(label="Distribution of compound")
-        count_fig = gr.Plot(label="Sentiment counts")
-        with gr.Row():
-            words_fig = gr.Plot(label="Top words")
-            bigrams_fig = gr.Plot(label="Top bigrams")
-            wc_img = gr.Image(label="Word cloud", type="pil")
-        with gr.Row():
-            tag_df = gr.Dataframe(label="Hashtag sentiment (count & mean compound)", interactive=False)
-            tag_fig = gr.Plot(label="Top hashtags (by count)")
-        asp_df = gr.Dataframe(label="Aspect sentiment (windowed)", interactive=False)
-        with gr.Row():
-            cluster_tbl = gr.Dataframe(label="Topic clusters (size & mean compound + top terms)", interactive=False)
-            cluster_fig = gr.Plot(label="Cluster mean sentiment")
-        out_file = gr.File(label="Download augmented CSV")
-        report_df = gr.Dataframe(label="Benchmark vs gold labels (if present)", interactive=False)
-        cm_plot = gr.Plot(label="Confusion matrix (if gold labels present)")
-
-        go.click(
-            fn=analyze_csv,
-            inputs=[file_up, model_csv, pos_thr, neg_thr, dedup, min_len, top_n, n_clusters, aspects],
-            outputs=[
-                summary_table,
-                hist_fig, count_fig,
-                words_fig, bigrams_fig, wc_img,
-                tag_df, tag_fig,
-                asp_df,
-                cluster_tbl, cluster_fig,
-                out_file,
-                report_df, cm_plot
-            ],
-            show_progress=True
-        )

-    #
-        with gr.Row():
-            ds_name = gr.Dropdown(
-                ["Sentiment140", "TweetEval (sentiment)"],
-                value="TweetEval (sentiment)",
-                label="Dataset"
-            )
-            ds_split = gr.Textbox(value="test", label="Split (e.g., train / validation / test)",)
-            sample_n = gr.Slider(0, 20000, value=2000, step=100, label="Sample size (0 = all)")
-            keyword = gr.Textbox(value="", label="Keyword filter (optional)")
-            rnd = gr.Checkbox(value=True, label="Random sample")
-        with gr.Row():
-            model_ds = gr.Dropdown(["VADER","Twitter-RoBERTa"], value="VADER", label="Model")
-            pos_thr_ds = gr.Slider(0.0, 0.5, value=0.05, step=0.01, label="Positive threshold (compound ≥)")
-            neg_thr_ds = gr.Slider(-0.5, 0.0, value=-0.05, step=0.01, label="Negative threshold (compound ≤)")
-        with gr.Row():
-            dedup_ds = gr.Checkbox(value=True, label="Drop duplicate tweets")
-            min_len_ds = gr.Slider(0, 10, value=3, step=1, label="Min token length (filter)")
-            top_n_ds = gr.Slider(5, 30, value=15, step=1, label="Top-N words/bigrams/hashtags")
-            n_clusters_ds = gr.Slider(2, 8, value=4, step=1, label="Topic clusters (k-means)")
-        aspects_ds = gr.Textbox(value="tariff, jobs, prices, china, farmers, john deere",
-                                label="Aspects (comma-separated)")
-
-        fetch = gr.Button("Load & Analyze Dataset", variant="primary")
-
-        preview = gr.Dataframe(label="Dataset preview (first rows)", interactive=False)
-        summary_table_ds = gr.Dataframe(label="Summary", interactive=False)
-        hist_fig_ds = gr.Plot(label="Distribution of compound")
-        count_fig_ds = gr.Plot(label="Sentiment counts")
-        with gr.Row():
-            words_fig_ds = gr.Plot(label="Top words")
-            bigrams_fig_ds = gr.Plot(label="Top bigrams")
-            wc_img_ds = gr.Image(label="Word cloud", type="pil")
-        with gr.Row():
-            tag_df_ds = gr.Dataframe(label="Hashtag sentiment (count & mean compound)", interactive=False)
-            tag_fig_ds = gr.Plot(label="Top hashtags (by count)")
-        asp_df_ds = gr.Dataframe(label="Aspect sentiment (windowed)", interactive=False)
-        with gr.Row():
-            cluster_tbl_ds = gr.Dataframe(label="Topic clusters (size & mean compound + top terms)", interactive=False)
-            cluster_fig_ds = gr.Plot(label="Cluster mean sentiment")
-        out_file_ds = gr.File(label="Download augmented CSV")
-        report_df_ds = gr.Dataframe(label="Benchmark vs gold labels", interactive=False)
-        cm_plot_ds = gr.Plot(label="Confusion matrix")
-
-        fetch.click(
-            fn=analyze_dataset,
-            inputs=[ds_name, ds_split, sample_n, keyword, rnd,
-                    model_ds, pos_thr_ds, neg_thr_ds, dedup_ds, min_len_ds, top_n_ds, n_clusters_ds, aspects_ds],
-            outputs=[
-                preview,
-                summary_table_ds,
-                hist_fig_ds, count_fig_ds,
-                words_fig_ds, bigrams_fig_ds, wc_img_ds,
-                tag_df_ds, tag_fig_ds,
-                asp_df_ds,
-                cluster_tbl_ds, cluster_fig_ds,
-                out_file_ds,
-                report_df_ds, cm_plot_ds
-            ],
-            show_progress=True
-        )

     gr.Markdown(
-        "
     )

 if __name__ == "__main__":
     demo.launch()
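The removed Datasets tab benchmarked predicted labels against gold labels with scikit-learn. A toy sketch of that evaluation step (the label lists here are made up, not results from the app):

from sklearn.metrics import classification_report, confusion_matrix

y_true = ["negative", "neutral", "positive", "positive", "negative"]   # gold labels (toy)
y_pred = ["negative", "positive", "positive", "neutral", "negative"]   # model labels (toy)

report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
cm = confusion_matrix(y_true, y_pred, labels=["negative", "neutral", "positive"])
print(round(report["accuracy"], 2))  # 0.6
print(cm.tolist())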
 import gradio as gr
 import pandas as pd
 import numpy as np
+import re
+from typing import List, Tuple
+
+# Lazy imports for heavy deps so the Space boots faster
+from functools import lru_cache
+
+def _lazy_imports():
+    global datasets, pipeline, WordCloud, plt
+    import matplotlib.pyplot as plt  # noqa: F401
+    from datasets import load_dataset  # noqa: F401
+    from transformers import pipeline as hf_pipeline  # noqa: F401
     try:
+        from wordcloud import WordCloud  # noqa: F401
     except Exception:
+        WordCloud = None
+    return locals()
+
+# ----------------------------
+# Helpers
+# ----------------------------
+TARIFF_KEYWORDS_DEFAULT = [
+    "tariff", "tariffs", "import tax", "trade war", "section 301", "section301",
+    "customs duty", "custom duties", "duties", "anti-dumping", "countervailing",
+    "steel tariff", "aluminum tariff", "aluminium tariff", "US tariff", "U.S. tariff",
+    "tariff policy", "retaliatory tariff", "tariff hike", "tariff cut"
+]
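_lazy_imports() plus functools.lru_cache is what keeps the Space booting quickly: heavy libraries are imported and models constructed only on first use, then reused from the cache. A tiny self-contained sketch of the same pattern (the "expensive" setup here is just a stand-in):

from functools import lru_cache

@lru_cache(maxsize=1)
def get_expensive_resource():
    # Imagine a slow import or model download here; it runs only on the first call.
    import time
    time.sleep(0.1)  # stand-in for the expensive setup
    return {"ready": True}

first = get_expensive_resource()    # pays the setup cost
second = get_expensive_resource()   # served from the cache
print(first is second)              # True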
+
+KEYWORD_PATTERN_CACHE = {}
+
+def compile_keyword_pattern(keywords: List[str]) -> re.Pattern:
+    key = "\u0001".join(sorted([k.strip().lower() for k in keywords if k.strip()]))
+    if key in KEYWORD_PATTERN_CACHE:
+        return KEYWORD_PATTERN_CACHE[key]
+    escaped = [re.escape(k) for k in keywords if k.strip()]
+    pattern = re.compile(r"(" + r"|".join(escaped) + r")", flags=re.IGNORECASE)
+    KEYWORD_PATTERN_CACHE[key] = pattern
+    return pattern
+
+
+def normalize_text(s: str) -> str:
+    s = re.sub(r"https?://\S+", " ", s)      # drop urls
+    s = re.sub(r"@[A-Za-z0-9_]+", " ", s)    # drop @mentions
+    s = re.sub(r"#[A-Za-z0-9_]+", " ", s)    # drop hashtags (we'll match keywords separately)
+    s = re.sub(r"\s+", " ", s).strip()
+    return s
+
+
+@lru_cache(maxsize=2)
+def load_sentiment_pipeline(model_name: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"):
+    _ = _lazy_imports()
+    from transformers import pipeline as hf_pipeline
+    pipe = hf_pipeline(
+        task="sentiment-analysis",
+        model=model_name,
+        tokenizer=model_name,
+        truncation=True,
+        max_length=256,
+        return_all_scores=False,
+        device=-1,
+    )
+    return pipe

+@lru_cache(maxsize=2)
+def load_hf_dataset(name: str):
+    _ = _lazy_imports()
+    from datasets import load_dataset
+    if name == "sentiment140":
+        # 1.6M tweets; we'll stream and sample later
+        ds = load_dataset("sentiment140")
+        # columns: ['sentiment','ids','date','query','user','text']
+        return ds
+    elif name == "tweet_eval":
+        # We'll use the sentiment subset
+        ds = load_dataset("tweet_eval", "sentiment")
+        # columns: ['text','label'] where label in {0:negative,1:neutral,2:positive}
+        return ds
+    else:
+        raise ValueError("Unsupported dataset: " + name)
+
+
+def filter_and_sample(df: pd.DataFrame, keywords: List[str], sample_size: int, random_state: int = 42) -> pd.DataFrame:
+    pat = compile_keyword_pattern(keywords)
+    mask = df['text'].str.contains(pat, na=False)
+    subset = df.loc[mask].copy()
+    if subset.empty:
+        return subset
+    if sample_size > 0 and len(subset) > sample_size:
+        subset = subset.sample(n=sample_size, random_state=random_state)
+    return subset
+
+
+def run_inference(texts: List[str], batch_size: int = 64) -> List[dict]:
+    pipe = load_sentiment_pipeline()
+    results = []
+    for i in range(0, len(texts), batch_size):
+        batch = texts[i:i+batch_size]
+        out = pipe(batch)
+        # normalize labels to {positive, neutral, negative}
+        for o in out:
+            lab = o.get('label', '').lower()
+            if 'pos' in lab:
+                label = 'positive'
+            elif 'neg' in lab:
+                label = 'negative'
+            else:
+                label = 'neutral'
+            results.append({'label': label, 'score': float(o.get('score', 0.0))})
+    return results
+
+
+def make_bar_plot(counts: pd.Series):
+    import matplotlib.pyplot as plt
+    fig = plt.figure(figsize=(5, 3.2), dpi=140)
+    ax = fig.gca()
+    counts = counts.reindex(['negative', 'neutral', 'positive']).fillna(0)
+    ax.bar(counts.index, counts.values)
+    total = int(counts.sum())
+    ax.set_title(f"Sentiment distribution (n={total})")
+    ax.set_xlabel("Sentiment")
+    ax.set_ylabel("# Tweets")
+    fig.tight_layout()
     return fig
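filter_and_sample() works because pandas Series.str.contains accepts a pre-compiled regular expression, so one case-insensitive pattern covers every keyword at once. A small sketch on a toy frame (the example rows are made up):

import re
import pandas as pd

pat = re.compile(r"(tariff|trade war)", flags=re.IGNORECASE)
df = pd.DataFrame({"text": ["New TARIFF hike announced", "cat pictures", "trade war escalates"]})
print(df[df["text"].str.contains(pat, na=False)]["text"].tolist())
# ['New TARIFF hike announced', 'trade war escalates']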
+def make_wordcloud(texts: List[str]):
+    # Optional; will return None if wordcloud isn't available
+    try:
+        from wordcloud import WordCloud
+    except Exception:
+        return None
+    joined = " ".join(texts)
+    wc = WordCloud(width=800, height=320, background_color="white").generate(joined)
+    import matplotlib.pyplot as plt
+    fig = plt.figure(figsize=(8, 3.6), dpi=120)
+    plt.imshow(wc)
+    plt.axis("off")
+    fig.tight_layout()
+    return fig
+
+# ----------------------------
+# Core pipeline
+# ----------------------------
+
+def analyze(dataset_choice: str,
+            keywords_csv: str,
+            max_rows: int,
+            include_wordcloud: bool) -> Tuple[str, "matplotlib.figure.Figure", "matplotlib.figure.Figure", pd.DataFrame]:
+    """Return (summary_markdown, bar_fig, wordcloud_fig|None, table_df)"""
+    ds = load_hf_dataset(dataset_choice)
+
+    # Convert to pandas
+    if dataset_choice == "sentiment140":
+        # concatenate a manageable slice from train/test (to keep runtime reasonable)
+        train = ds.get('train')
+        test = ds.get('test')
+        frames = []
+        for split in [train, test]:
+            if split is None:
+                continue
+            # Take a small random slice to keep Space responsive
+            n = len(split)
+            take = min(n, 150_000)  # cap
+            frames.append(split.shuffle(seed=42).select(range(take)).to_pandas()[['text', 'date']])
+        df = pd.concat(frames, ignore_index=True)
+    else:
+        # tweet_eval sentiment
+        frames = []
+        for name in ['train', 'validation', 'test']:
+            if name in ds:
+                frames.append(ds[name].to_pandas()[['text']])
+        df = pd.concat(frames, ignore_index=True)
+    if 'date' not in df.columns:
+        df['date'] = np.nan
+
+    # Clean
+    df['text'] = df['text'].astype(str).apply(normalize_text)
+
+    # Keywords
+    keywords = [k.strip() for k in (keywords_csv or "").split(',') if k.strip()] or TARIFF_KEYWORDS_DEFAULT
+
+    # Filter + sample
+    subset = filter_and_sample(df, keywords, sample_size=max_rows)
+    if subset.empty:
         return (
+            "### No matches found\nTry broadening keywords or increasing the sample size.",
+            make_bar_plot(pd.Series(dtype=int)),
+            None,
+            pd.DataFrame(columns=['text','pred_label','pred_score','date'])
         )
+    # Inference
+    preds = run_inference(subset['text'].tolist())
+    pred_df = pd.DataFrame(preds)
+    subset = subset.reset_index(drop=True).copy()
+    subset['pred_label'] = pred_df['label']
+    subset['pred_score'] = pred_df['score']
+
+    # Metrics
+    counts = subset['pred_label'].value_counts()
+    total = int(counts.sum())
+    pct = (counts / max(total, 1) * 100).round(1)
+
+    # Summary text
+    sentiment_line = (
+        f"**Negative:** {int(counts.get('negative', 0))} ({pct.get('negative', 0.0)}%) | "
+        f"**Neutral:** {int(counts.get('neutral', 0))} ({pct.get('neutral', 0.0)}%) | "
+        f"**Positive:** {int(counts.get('positive', 0))} ({pct.get('positive', 0.0)}%)"
     )

+    summary = (
+        "## Tariff Tweet Sentiment — Snapshot\n"
+        f"Dataset: **{dataset_choice}** | Sampled tweets: **{total}**\n\n"
+        f"Keyword filter: `{', '.join(keywords)}`\n\n"
+        + sentiment_line +
+        "\n\nTip: Neutral can be high when tweets are mostly informative (news/links) or ambiguous."
+    )
+
+    # Plots
+    bar_fig = make_bar_plot(counts)
+    wc_fig = make_wordcloud(subset['text'].tolist()) if include_wordcloud else None
+
+    # Output table (limit rows for UI responsiveness)
+    out_df = subset[['text','pred_label','pred_score','date']]
+
+    return summary, bar_fig, wc_fig, out_df
+
+
+# ----------------------------
+# Gradio UI
+# ----------------------------
+with gr.Blocks(title="Tariff Tweet Sentiment (No Twitter API)") as demo:
     gr.Markdown(
+        """
+        # Tariff Tweet Sentiment
+        Analyze how people talk about **U.S. tariff policy** using public Twitter corpora (no API key required).
+
+        **How it works**
+        - Choose a public dataset (e.g., `sentiment140` or `tweet_eval/sentiment`).
+        - Filter tweets by keywords like *tariff*, *trade war*, *Section 301*, etc.
+        - Run a Twitter-optimized sentiment model.
+        - View distribution, word cloud, and the matching tweets.
+
+        *Note:* Public corpora may skew older or topical; results are a **snapshot**, not a live feed.
+        """
     )

+    with gr.Row():
+        dataset_choice = gr.Dropdown(
+            choices=["sentiment140", "tweet_eval"],
+            value="sentiment140",
+            label="Dataset"
+        )
+        max_rows = gr.Slider(100, 5000, value=1500, step=50, label="Max tweets to analyze (after keyword filter)")
+        keywords_csv = gr.Textbox(value=", ".join(TARIFF_KEYWORDS_DEFAULT), label="Keywords (comma‑separated)")
+        include_wordcloud = gr.Checkbox(value=True, label="Include word cloud (optional)")
+
+    run_btn = gr.Button("Run Analysis", variant="primary")
+
+    summary_md = gr.Markdown()
+    bar_plot = gr.Plot(label="Sentiment distribution")
+    wc_plot = gr.Plot(label="Word cloud (optional)")
+    table = gr.Dataframe(headers=["text","pred_label","pred_score","date"], wrap=True, interactive=False)
+    csv = gr.File(label="Download CSV of results", visible=True)
+
+    def _go(dataset_choice, keywords_csv, max_rows, include_wordcloud):
+        summary, bar_fig, wc_fig, df = analyze(dataset_choice, keywords_csv, int(max_rows), bool(include_wordcloud))
+        # Save CSV
+        out_path = "tariff_tweets_sentiment.csv"
+        df.to_csv(out_path, index=False)
+        return summary, bar_fig, wc_fig, df, out_path
+
+    run_btn.click(_go, [dataset_choice, keywords_csv, max_rows, include_wordcloud], [summary_md, bar_plot, wc_plot, table, csv])
+
 if __name__ == "__main__":
     demo.launch()
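The Gradio UI is a thin wrapper around analyze(). A hypothetical headless driver (not part of the Space) that calls it directly, assuming this new app.py is importable as app and its dependencies are installed:

from app import analyze  # hypothetical: run from the same directory as app.py

summary, bar_fig, wc_fig, table = analyze(
    dataset_choice="tweet_eval",      # smaller download than sentiment140
    keywords_csv="tariff, trade war",
    max_rows=200,
    include_wordcloud=False,
)
print(summary)
print(table.head())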