import gradio as gr
import pandas as pd
import numpy as np
import re
from functools import lru_cache
from typing import List, Tuple


# Lazy imports for heavy deps so the Space boots faster
def _lazy_imports():
    global plt, load_dataset, hf_pipeline, WordCloud
    import matplotlib.pyplot as plt  # noqa: F401
    from datasets import load_dataset  # noqa: F401
    from transformers import pipeline as hf_pipeline  # noqa: F401
    try:
        from wordcloud import WordCloud  # noqa: F401
    except ImportError:
        WordCloud = None


# ----------------------------
# Helpers
# ----------------------------
TARIFF_KEYWORDS_DEFAULT = [
    "tariff", "tariffs", "import tax", "trade war", "section 301", "section301",
    "customs duty", "custom duties", "duties", "anti-dumping", "countervailing",
    "steel tariff", "aluminum tariff", "aluminium tariff", "US tariff",
    "U.S. tariff", "tariff policy", "retaliatory tariff", "tariff hike", "tariff cut",
]

KEYWORD_PATTERN_CACHE = {}


def compile_keyword_pattern(keywords: List[str]) -> re.Pattern:
    # Cache compiled patterns keyed by the normalized, order-independent keyword set
    key = "\u0001".join(sorted(k.strip().lower() for k in keywords if k.strip()))
    if key in KEYWORD_PATTERN_CACHE:
        return KEYWORD_PATTERN_CACHE[key]
    escaped = [re.escape(k) for k in keywords if k.strip()]
    pattern = re.compile(r"(" + r"|".join(escaped) + r")", flags=re.IGNORECASE)
    KEYWORD_PATTERN_CACHE[key] = pattern
    return pattern


def normalize_text(s: str) -> str:
    s = re.sub(r"https?://\S+", " ", s)  # drop urls
    s = re.sub(r"@[A-Za-z0-9_]+", " ", s)  # drop @mentions
    # Strip the '#' but keep the tag text, so "#tariff" still matches the keyword filter
    s = re.sub(r"#([A-Za-z0-9_]+)", r" \1 ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


@lru_cache(maxsize=2)
def load_sentiment_pipeline(model_name: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"):
    _lazy_imports()
    from transformers import pipeline as hf_pipeline
    return hf_pipeline(
        task="sentiment-analysis",
        model=model_name,
        tokenizer=model_name,
        truncation=True,
        max_length=256,
        device=-1,  # CPU
    )


@lru_cache(maxsize=2)
def load_hf_dataset(name: str):
    _lazy_imports()
    from datasets import load_dataset
    if name == "sentiment140":
        # 1.6M tweets; we'll sample later
        # columns: ['sentiment', 'ids', 'date', 'query', 'user', 'text']
        return load_dataset("sentiment140", trust_remote_code=True)
    elif name == "tweet_eval":
        # sentiment subset; columns: ['text', 'label'] with
        # label in {0: negative, 1: neutral, 2: positive}
        return load_dataset("tweet_eval", "sentiment")
    else:
        raise ValueError("Unsupported dataset: " + name)


def filter_and_sample(df: pd.DataFrame, keywords: List[str], sample_size: int,
                      random_state: int = 42) -> pd.DataFrame:
    pat = compile_keyword_pattern(keywords)
    mask = df["text"].str.contains(pat, na=False)
    subset = df.loc[mask].copy()
    if subset.empty:
        return subset
    if sample_size > 0 and len(subset) > sample_size:
        subset = subset.sample(n=sample_size, random_state=random_state)
    return subset


def run_inference(texts: List[str], batch_size: int = 64) -> List[dict]:
    pipe = load_sentiment_pipeline()
    results = []
    for i in range(0, len(texts), batch_size):
        out = pipe(texts[i:i + batch_size])
        # Normalize model labels to {positive, neutral, negative}
        for o in out:
            lab = o.get("label", "").lower()
            if "pos" in lab:
                label = "positive"
            elif "neg" in lab:
                label = "negative"
            else:
                label = "neutral"
            results.append({"label": label, "score": float(o.get("score", 0.0))})
    return results
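
# Illustrative sketch (not part of the original app, and never called): a quick
# way to eyeball filter_and_sample() and run_inference() together on a toy
# frame. sample_size=0 disables sampling, so every keyword match is kept. Note
# that calling it loads the model, which downloads weights on first use.
def _demo_keyword_filter_and_score():
    toy = pd.DataFrame({"text": [
        "New steel tariff announced today",
        "Lovely weather this weekend",
        "Retaliatory tariffs could raise import prices",
    ]})
    hits = filter_and_sample(toy, TARIFF_KEYWORDS_DEFAULT, sample_size=0)
    for text, pred in zip(hits["text"], run_inference(hits["text"].tolist())):
        print(f"{pred['label']:>8} ({pred['score']:.2f})  {text}")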
def make_bar_plot(counts: pd.Series):
    import matplotlib.pyplot as plt

    fig = plt.figure(figsize=(5, 3.2), dpi=140)
    ax = fig.gca()
    counts = counts.reindex(["negative", "neutral", "positive"]).fillna(0)
    ax.bar(counts.index, counts.values)
    total = int(counts.sum())
    ax.set_title(f"Sentiment distribution (n={total})")
    ax.set_xlabel("Sentiment")
    ax.set_ylabel("# Tweets")
    fig.tight_layout()
    return fig


def make_wordcloud(texts: List[str]):
    # Optional; returns None if wordcloud isn't available
    try:
        from wordcloud import WordCloud
    except Exception:
        return None
    import matplotlib.pyplot as plt

    joined = " ".join(texts)
    wc = WordCloud(width=800, height=320, background_color="white").generate(joined)
    fig = plt.figure(figsize=(8, 3.6), dpi=120)
    plt.imshow(wc)
    plt.axis("off")
    fig.tight_layout()
    return fig
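
# Optional sanity check (an illustrative sketch, not wired into the UI):
# tweet_eval's sentiment subset ships gold labels (0=negative, 1=neutral,
# 2=positive), so agreement between model predictions and gold labels on a
# small random slice gives a rough sense of prediction quality.
def _demo_tweet_eval_agreement(n: int = 200) -> float:
    split = load_hf_dataset("tweet_eval")["validation"].shuffle(seed=0).select(range(n))
    gold = ["negative", "neutral", "positive"]
    preds = run_inference([normalize_text(t) for t in split["text"]])
    agree = sum(p["label"] == gold[lab] for p, lab in zip(preds, split["label"]))
    return agree / n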
# ----------------------------
# Core pipeline
# ----------------------------
def analyze(dataset_choice: str, keywords_csv: str, max_rows: int, include_wordcloud: bool
            ) -> Tuple[str, "matplotlib.figure.Figure", "matplotlib.figure.Figure", pd.DataFrame]:
    """Return (summary_markdown, bar_fig, wordcloud_fig | None, table_df)."""
    ds = load_hf_dataset(dataset_choice)

    # Convert to pandas
    if dataset_choice == "sentiment140":
        # Concatenate a manageable slice from train/test to keep runtime reasonable
        frames = []
        for split in [ds.get("train"), ds.get("test")]:
            if split is None:
                continue
            take = min(len(split), 150_000)  # cap to keep the Space responsive
            frames.append(split.shuffle(seed=42).select(range(take)).to_pandas()[["text", "date"]])
        df = pd.concat(frames, ignore_index=True)
    else:
        # tweet_eval sentiment
        frames = [ds[name].to_pandas()[["text"]]
                  for name in ["train", "validation", "test"] if name in ds]
        df = pd.concat(frames, ignore_index=True)

    if "date" not in df.columns:
        df["date"] = np.nan

    # Clean
    df["text"] = df["text"].astype(str).apply(normalize_text)

    # Keywords
    keywords = [k.strip() for k in (keywords_csv or "").split(",") if k.strip()] or TARIFF_KEYWORDS_DEFAULT

    # Filter + sample
    subset = filter_and_sample(df, keywords, sample_size=max_rows)
    if subset.empty:
        return (
            "### No matches found\nTry broadening keywords or increasing the sample size.",
            make_bar_plot(pd.Series(dtype=int)),
            None,
            pd.DataFrame(columns=["text", "pred_label", "pred_score", "date"]),
        )

    # Inference
    preds = run_inference(subset["text"].tolist())
    pred_df = pd.DataFrame(preds)
    subset = subset.reset_index(drop=True)
    subset["pred_label"] = pred_df["label"]
    subset["pred_score"] = pred_df["score"]

    # Metrics
    counts = subset["pred_label"].value_counts()
    total = int(counts.sum())
    pct = (counts / max(total, 1) * 100).round(1)

    # Summary text
    sentiment_line = (
        f"**Negative:** {int(counts.get('negative', 0))} ({pct.get('negative', 0.0)}%) | "
        f"**Neutral:** {int(counts.get('neutral', 0))} ({pct.get('neutral', 0.0)}%) | "
        f"**Positive:** {int(counts.get('positive', 0))} ({pct.get('positive', 0.0)}%)"
    )
    summary = (
        "## Tariff Tweet Sentiment — Snapshot\n"
        f"Dataset: **{dataset_choice}** | Sampled tweets: **{total}**\n\n"
        f"Keyword filter: `{', '.join(keywords)}`\n\n"
        + sentiment_line
        + "\n\nTip: Neutral can be high when tweets are mostly informative (news/links) or ambiguous."
    )

    # Plots
    bar_fig = make_bar_plot(counts)
    wc_fig = make_wordcloud(subset["text"].tolist()) if include_wordcloud else None

    # Output table (the click handler caps displayed rows; the CSV keeps everything)
    out_df = subset[["text", "pred_label", "pred_score", "date"]]
    return summary, bar_fig, wc_fig, out_df


# ----------------------------
# Gradio UI
# ----------------------------
with gr.Blocks(title="Tariff Tweet Sentiment (No Twitter API)") as demo:
    gr.Markdown(
        """
        # Tariff Tweet Sentiment
        Analyze how people talk about **U.S. tariff policy** using public Twitter corpora (no API key required).

        **How it works**
        - Choose a public dataset (e.g., `sentiment140` or `tweet_eval/sentiment`).
        - Filter tweets by keywords like *tariff*, *trade war*, *Section 301*, etc.
        - Run a Twitter-optimized sentiment model.
        - View the distribution, a word cloud, and the matching tweets.

        *Note:* Public corpora may skew older or topical; results are a **snapshot**, not a live feed.
        """
    )
    with gr.Row():
        dataset_choice = gr.Dropdown(
            choices=["sentiment140", "tweet_eval"],
            value="sentiment140",
            label="Dataset",
        )
        max_rows = gr.Slider(100, 5000, value=1500, step=50,
                             label="Max tweets to analyze (after keyword filter)")
    keywords_csv = gr.Textbox(value=", ".join(TARIFF_KEYWORDS_DEFAULT),
                              label="Keywords (comma-separated)")
    include_wordcloud = gr.Checkbox(value=True, label="Include word cloud (optional)")
    run_btn = gr.Button("Run Analysis", variant="primary")

    summary_md = gr.Markdown()
    bar_plot = gr.Plot(label="Sentiment distribution")
    wc_plot = gr.Plot(label="Word cloud (optional)")
    table = gr.Dataframe(headers=["text", "pred_label", "pred_score", "date"],
                         wrap=True, interactive=False)
    csv = gr.File(label="Download CSV of results", visible=True)

    def _go(dataset_choice, keywords_csv, max_rows, include_wordcloud):
        summary, bar_fig, wc_fig, df = analyze(
            dataset_choice, keywords_csv, int(max_rows), bool(include_wordcloud)
        )
        # Save the full results to CSV, but cap the on-screen table for responsiveness
        out_path = "tariff_tweets_sentiment.csv"
        df.to_csv(out_path, index=False)
        return summary, bar_fig, wc_fig, df.head(1000), out_path

    run_btn.click(
        _go,
        [dataset_choice, keywords_csv, max_rows, include_wordcloud],
        [summary_md, bar_plot, wc_plot, table, csv],
    )


if __name__ == "__main__":
    demo.launch()
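
# Headless usage sketch (assumption: this file is saved as app.py and imported
# from a separate script or REPL, since running it directly launches the UI):
#
#   from app import analyze
#   summary, bar_fig, wc_fig, df = analyze(
#       "tweet_eval", "tariff, trade war", max_rows=500, include_wordcloud=False
#   )
#   df.to_csv("tariff_subset.csv", index=False)
#
# Assumed dependencies for the Space (roughly, in requirements.txt): gradio,
# pandas, numpy, matplotlib, datasets, transformers, torch, and optionally
# wordcloud -- the app degrades gracefully when wordcloud is missing.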