Tulitula committed on
Commit
bbc558b
·
verified ·
1 Parent(s): 56a394e

Update app.py

Files changed (1)
  1. app.py +489 -272
app.py CHANGED
@@ -1,5 +1,5 @@
1
- # app.py — Efficient Portfolio Advisor (CML-safe, same-ticker suggestions, embeddings always on)
2
- import os, io, math, time, warnings
3
  warnings.filterwarnings("ignore")
4
 
5
  from typing import List, Tuple, Dict, Optional
@@ -18,16 +18,18 @@ os.makedirs(DATA_DIR, exist_ok=True)
18
 
19
  MAX_TICKERS = 30
20
  DEFAULT_LOOKBACK_YEARS = 10
21
- MARKET_TICKER = "VOO"
22
 
23
- SYNTH_ROWS = 1000
24
- EMBED_MODEL_NAME = "FinLang/finance-embeddings-investopedia"
25
- EMBED_ALPHA = 0.6 # exposure similarity weight
26
- MMR_LAMBDA = 0.7 # diversity for re-ranking (kept for consistency even though we output 1 per band)
27
 
28
  HORIZON_YEARS = 10
29
  RF_CODE = "DGS10"
30
- RF_ANN = 0.0375
31
 
32
  # ---------------- helpers ----------------
33
  def fred_series_for_horizon(years: float) -> str:
@@ -43,7 +45,8 @@ def fred_series_for_horizon(years: float) -> str:
43
  def fetch_fred_yield_annual(code: str) -> float:
44
  url = f"https://fred.stlouisfed.org/graph/fredgraph.csv?id={code}"
45
  try:
46
- r = requests.get(url, timeout=10); r.raise_for_status()
 
47
  df = pd.read_csv(io.StringIO(r.text))
48
  s = pd.to_numeric(df.iloc[:, 1], errors="coerce").dropna()
49
  return float(s.iloc[-1] / 100.0) if len(s) else 0.03
@@ -51,227 +54,344 @@ def fetch_fred_yield_annual(code: str) -> float:
51
  return 0.03
52
 
53
  def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
54
- tickers = list(dict.fromkeys([t.upper().strip() for t in tickers if t]))
55
- start = (pd.Timestamp.today(tz="UTC") - pd.DateOffset(years=int(years), days=7)).date()
56
  end = pd.Timestamp.today(tz="UTC").date()
 
57
  df = yf.download(
58
- tickers, start=start, end=end, interval="1mo",
59
- auto_adjust=True, actions=False, progress=False,
60
- group_by="column", threads=False,
61
  )
62
- if isinstance(df, pd.Series): df = df.to_frame()
 
 
63
  if isinstance(df.columns, pd.MultiIndex):
64
  lvl0 = [str(x) for x in df.columns.get_level_values(0).unique()]
65
- if "Close" in lvl0: df = df["Close"]
66
- elif "Adj Close" in lvl0: df = df["Adj Close"]
67
- else: df = df.xs(df.columns.levels[0][-1], axis=1, level=0, drop_level=True)
68
  cols = [c for c in tickers if c in df.columns]
69
- return df[cols].dropna(how="all").fillna(method="ffill")
 
70
 
71
  def monthly_returns(prices: pd.DataFrame) -> pd.DataFrame:
72
  return prices.pct_change().dropna()
73
 
74
  def yahoo_search(query: str):
75
- if not query or not str(query).strip(): return []
 
76
  url = "https://query1.finance.yahoo.com/v1/finance/search"
77
  params = {"q": query.strip(), "quotesCount": 10, "newsCount": 0}
78
  headers = {"User-Agent": "Mozilla/5.0"}
79
  try:
80
- r = requests.get(url, params=params, headers=headers, timeout=10); r.raise_for_status()
81
- data = r.json(); out = []
 
 
82
  for q in data.get("quotes", []):
83
- sym = q.get("symbol"); name = q.get("shortname") or q.get("longname") or ""; exch = q.get("exchDisp") or ""
84
- if sym and sym.isascii(): out.append(f"{sym} | {name} | {exch}")
85
- if not out: out = [f"{query.strip().upper()} | typed symbol | n/a"]
86
  return out[:10]
87
  except Exception:
88
  return [f"{query.strip().upper()} | typed symbol | n/a"]
89
 
90
  def validate_tickers(symbols: List[str], years: int) -> List[str]:
91
  base = [s for s in dict.fromkeys([t.upper().strip() for t in symbols]) if s]
92
- px = fetch_prices_monthly(base + [MARKET_TICKER], years)
 
93
  ok = [s for s in base if s in px.columns]
94
- if MARKET_TICKER not in px.columns: return []
 
 
95
  return ok
96
 
97
- # ---------- aligned moments & covariances (incl. market) ----------
98
  def get_aligned_monthly_returns(symbols: List[str], years: int) -> pd.DataFrame:
99
- uniq = [c for c in dict.fromkeys(symbols)]
100
- if MARKET_TICKER not in uniq: uniq.append(MARKET_TICKER)
101
- px = fetch_prices_monthly(uniq, years)
102
  rets = monthly_returns(px)
103
- cols = [c for c in uniq if c in rets.columns]
104
  R = rets[cols].dropna(how="any")
105
  return R.loc[:, ~R.columns.duplicated()]
106
 
107
  def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
108
  R = get_aligned_monthly_returns(symbols, years)
109
- if MARKET_TICKER not in R.columns or len(R) < 3:
110
  raise ValueError("Not enough aligned data with market proxy.")
111
  rf_m = rf_ann / 12.0
112
 
113
- m = R[MARKET_TICKER]
114
- if isinstance(m, pd.DataFrame): m = m.iloc[:, 0].squeeze()
 
 
115
  mu_m_ann = float(m.mean() * 12.0)
116
  sigma_m_ann = float(m.std(ddof=1) * math.sqrt(12.0))
117
  erp_ann = float(mu_m_ann - rf_ann)
118
 
119
  ex_m = m - rf_m
120
- var_m = float(np.var(ex_m.values, ddof=1)); var_m = max(var_m, 1e-9)
 
121
 
122
  betas: Dict[str, float] = {}
123
- for s in [c for c in R.columns if c != MARKET_TICKER]:
124
  ex_s = R[s] - rf_m
125
  cov_sm = float(np.cov(ex_s.values, ex_m.values, ddof=1)[0, 1])
126
  betas[s] = cov_sm / var_m
127
- betas[MARKET_TICKER] = 1.0
128
 
129
- # Full covariance including MARKET_TICKER
130
- cov_all_ann = pd.DataFrame(np.cov(R.values.T, ddof=1) * 12.0,
131
- index=R.columns, columns=R.columns)
132
 
133
- return {"betas": betas, "cov_all_ann": cov_all_ann, "erp_ann": erp_ann, "sigma_m_ann": sigma_m_ann}
134
 
135
  def capm_er(beta: float, rf_ann: float, erp_ann: float) -> float:
136
  return float(rf_ann + beta * erp_ann)
137
 
138
  def portfolio_stats(weights: Dict[str, float],
139
- cov_all_ann: pd.DataFrame,
140
  betas: Dict[str, float],
141
  rf_ann: float,
142
  erp_ann: float) -> Tuple[float, float, float]:
143
  tickers = list(weights.keys())
144
  w = np.array([weights[t] for t in tickers], dtype=float)
145
  gross = float(np.sum(np.abs(w)))
146
- if gross <= 1e-12: return 0.0, rf_ann, 0.0
 
147
  w_expo = w / gross
148
  beta_p = float(np.dot([betas.get(t, 0.0) for t in tickers], w_expo))
149
  mu_capm = capm_er(beta_p, rf_ann, erp_ann)
150
- cov = cov_all_ann.reindex(index=tickers, columns=tickers).fillna(0.0).to_numpy()
151
  sigma_hist = float(max(w_expo.T @ cov @ w_expo, 0.0)) ** 0.5
152
  return beta_p, mu_capm, sigma_hist
153
 
154
  def efficient_same_sigma(sigma_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
155
- if sigma_mkt <= 1e-12: return 0.0, 1.0, rf_ann
 
156
  a = sigma_target / sigma_mkt
157
  return a, 1.0 - a, rf_ann + a * erp_ann
158
 
159
  def efficient_same_return(mu_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
160
- if abs(erp_ann) <= 1e-12: return 0.0, 1.0, rf_ann
 
161
  a = (mu_target - rf_ann) / erp_ann
162
  return a, 1.0 - a, abs(a) * sigma_mkt
163
 
164
- # -------------- plotting --------------
165
  def _pct(x): return np.asarray(x, dtype=float) * 100.0
166
 
167
- def plot_cml_hybrid(rf_ann, erp_ann, sigma_mkt,
168
- sigma_hist_port, mu_capm_port,
169
- mu_eff_same_sigma, sigma_eff_same_return,
170
- sugg_mu=None, sugg_sigma_hist=None) -> Image.Image:
171
- fig = plt.figure(figsize=(6.5, 4.2), dpi=120)
172
- xmax = max(0.3, sigma_mkt * 2.2, (sigma_hist_port or 0.0) * 1.6,
173
- (sigma_eff_same_return or 0.0) * 1.6, (sugg_sigma_hist or 0.0) * 1.6)
174
- xs = np.linspace(0.0, xmax, 240)
175
- cml = rf_ann + (erp_ann / max(sigma_mkt, 1e-9)) * xs if sigma_mkt > 1e-12 else np.full_like(xs, rf_ann)
176
  plt.plot(_pct(xs), _pct(cml), label="CML (Market/Bills)", linewidth=1.8)
177
- plt.scatter([_pct(0)], [_pct(rf_ann)], label="Risk-free", zorder=3)
178
- plt.scatter([_pct(sigma_mkt)], [_pct(rf_ann + erp_ann)], label="Market", zorder=3)
179
- plt.scatter([_pct(sigma_hist_port)], [_pct(mu_capm_port)], label="Your CAPM point", marker="o", zorder=4)
180
- plt.scatter([_pct(sigma_hist_port)], [_pct(mu_eff_same_sigma)], label="Efficient (same σ)", marker="^", zorder=4)
181
- plt.scatter([_pct(sigma_eff_same_return)], [_pct(mu_capm_port)], label="Efficient (same E[r])", marker="s", zorder=4)
182
- if (sugg_mu is not None) and (sugg_sigma_hist is not None):
183
- plt.scatter([_pct(sugg_sigma_hist)], [_pct(sugg_mu)], label="Selected Suggestion", marker="X", s=70, zorder=5)
184
- plt.xlabel("σ (historical, annualized, %)"); plt.ylabel("CAPM E[r] (annual, %)")
185
- plt.legend(loc="best", fontsize=8); plt.tight_layout()
186
- buf = io.BytesIO(); plt.savefig(buf, format="png"); plt.close(fig); buf.seek(0)
187
  return Image.open(buf)
188
 
189
- # -------------- synthetic dataset (re-weights of SAME tickers) --------------
190
- def build_synthetic_dataset_same_tickers(tickers: List[str],
191
- cov_all_ann: pd.DataFrame,
192
- betas: Dict[str, float],
193
- rf_ann: float,
194
- erp_ann: float,
195
- n_rows: int = SYNTH_ROWS) -> pd.DataFrame:
196
- """Generate long-only Dirichlet weights over EXACTLY the user's tickers."""
197
  rng = np.random.default_rng(12345)
198
- picks = [t for t in tickers] # fixed set
199
- k = len(picks)
 
 
200
  rows = []
201
  for _ in range(n_rows):
202
- w = rng.dirichlet(np.ones(k))
 
 
203
  beta_p = float(np.dot([betas.get(t, 0.0) for t in picks], w))
204
  mu_capm = capm_er(beta_p, rf_ann, erp_ann)
205
- sub_cov = cov_all_ann.reindex(index=picks, columns=picks).fillna(0.0).to_numpy()
206
- sigma_hist = float(max(w.T @ sub_cov @ w, 0.0)) ** 0.5
 
207
  rows.append({
208
  "tickers": ",".join(picks),
209
- "weights": ",".join(f"{x:.6f}" for x in w),
210
  "beta": beta_p,
211
  "mu_capm": mu_capm,
212
  "sigma_hist": sigma_hist
213
  })
214
  return pd.DataFrame(rows)
215
 
 
216
  def _band_bounds_sigma_hist(sigma_mkt: float, band: str) -> Tuple[float, float]:
217
  band = (band or "Medium").strip().lower()
218
- if band.startswith("low"): return 0.0, 0.8 * sigma_mkt
219
- if band.startswith("high"): return 1.2 * sigma_mkt, 3.0 * sigma_mkt
 
 
220
  return 0.8 * sigma_mkt, 1.2 * sigma_mkt
221
 
222
- # -------------- embeddings & scoring --------------
223
- _EMBED_MODEL = None
224
- _TICKER_EMBED_CACHE: Dict[str, np.ndarray] = {}
225
-
226
- def _load_embed_model():
227
- global _EMBED_MODEL
228
- if _EMBED_MODEL is not None: return _EMBED_MODEL
229
  try:
230
  from sentence_transformers import SentenceTransformer
231
- _EMBED_MODEL = SentenceTransformer(EMBED_MODEL_NAME)
232
  except Exception:
233
- _EMBED_MODEL = None
234
- return _EMBED_MODEL
235
-
236
- def _embed_texts(texts: List[str]) -> np.ndarray:
237
- model = _load_embed_model()
238
- if model is None: return np.zeros((len(texts), 384), dtype=float)
239
- return np.array(model.encode(texts), dtype=float)
240
 
241
- def _ticker_vec(t: str) -> np.ndarray:
242
- t = t.upper().strip()
243
- if t in _TICKER_EMBED_CACHE: return _TICKER_EMBED_CACHE[t]
244
- v = _embed_texts([f"ticker {t}"])[0]; _TICKER_EMBED_CACHE[t] = v; return v
 
 
 
245
 
246
  def _portfolio_embedding(tickers: List[str], weights: List[float]) -> np.ndarray:
247
- if not tickers: return np.zeros(384, dtype=float)
248
- w = np.array(weights, dtype=float); s = float(np.sum(np.abs(w)))
249
- w = (np.ones(len(tickers))/len(tickers)) if s<=1e-12 else (w/s)
250
- vs = np.stack([_ticker_vec(t) for t in tickers], axis=0)
251
- v = (w[:,None]*vs).sum(axis=0); n = float(np.linalg.norm(v))
252
- return v/(n if n>1e-12 else 1.0)
 
 
 
253
 
254
  def _cos_sim(a: np.ndarray, b: np.ndarray) -> float:
255
- na = float(np.linalg.norm(a)); nb = float(np.linalg.norm(b))
256
- if na<=1e-12 or nb<=1e-12: return 0.0
257
- return float(np.dot(a,b)/(na*nb))
258
-
259
- def _exposure_similarity(user_map: Dict[str,float], cand_map: Dict[str,float]) -> float:
260
- s_user = sum(abs(x) for x in user_map.values()); s_c = sum(abs(x) for x in cand_map.values())
261
- if s_user<=1e-12 or s_c<=1e-12: return 0.0
262
- u = {k:abs(v)/s_user for k,v in user_map.items()}
263
- c = {k:abs(v)/s_c for k,v in cand_map.items()}
264
- common = set(u)&set(c); return float(sum(min(u[t],c[t]) for t in common))
265
-
266
- def pick_best_in_band(user_df: pd.DataFrame, band_df: pd.DataFrame,
267
- alpha: float = EMBED_ALPHA, top_N: int = 50) -> pd.Series:
268
- """
269
- Score candidates by alpha*exposure-sim + (1-alpha)*embedding-sim, among top_N by CAPM return.
270
- Return the single best row.
271
- """
272
- if band_df.empty: return pd.Series(dtype="float64")
273
  try:
274
- # restrict to strong candidates by return first
275
  band_df = band_df.sort_values("mu_capm", ascending=False).head(top_N).reset_index(drop=True)
276
 
277
  u_t = user_df["ticker"].astype(str).str.upper().tolist()
@@ -283,27 +403,36 @@ def pick_best_in_band(user_df: pd.DataFrame, band_df: pd.DataFrame,
283
  for _, r in band_df.iterrows():
284
  ts = [t.strip().upper() for t in str(r["tickers"]).split(",")]
285
  ws = [float(x) for x in str(r["weights"]).split(",")]
286
- s = sum(max(0.0,w) for w in ws) or 1.0
287
- ws = [max(0.0,w)/s for w in ws]
288
- c_map = {t:w for t,w in zip(ts,ws)}
289
  c_embed = _portfolio_embedding(ts, ws)
290
  expo_sim = _exposure_similarity(u_map, c_map)
291
  emb_sim = _cos_sim(u_embed, c_embed)
292
- scores.append(alpha*expo_sim + (1.0-alpha)*emb_sim)
293
 
294
- i_best = int(np.argmax(scores))
295
- return band_df.iloc[i_best]
 
 
296
  except Exception:
297
  return band_df.iloc[0]
298
 
299
  # -------------- UI helpers --------------
300
- def empty_positions_df(): return pd.DataFrame(columns=["ticker","amount_usd","weight_exposure","beta"])
301
- def empty_holdings_df(): return pd.DataFrame(columns=["ticker","weight_%","amount_$"])
 
 
 
302
 
303
  def set_horizon(years: float):
304
- y = max(1.0, min(100.0, float(years))); code = fred_series_for_horizon(y); rf = fetch_fred_yield_annual(code)
 
 
305
  global HORIZON_YEARS, RF_CODE, RF_ANN
306
- HORIZON_YEARS, RF_CODE, RF_ANN = y, code, rf
 
 
307
  return f"Risk-free series {code}. Latest annual rate {rf:.2%}."
308
 
309
  def search_tickers_cb(q: str):
@@ -313,50 +442,52 @@ def search_tickers_cb(q: str):
313
 
314
  def add_symbol(selection: str, table: Optional[pd.DataFrame]):
315
  if not selection:
316
- return table if isinstance(table,pd.DataFrame) else pd.DataFrame(columns=["ticker","amount_usd"]), "Pick a row in Matches first."
317
  symbol = selection.split("|")[0].strip().upper()
 
318
  current = []
319
- if isinstance(table,pd.DataFrame) and not table.empty:
320
  current = [str(x).upper() for x in table["ticker"].tolist() if str(x) != "nan"]
321
  tickers = current if symbol in current else current + [symbol]
 
 
322
  val = validate_tickers(tickers, years=DEFAULT_LOOKBACK_YEARS)
323
  tickers = [t for t in tickers if t in val]
 
324
  amt_map = {}
325
- if isinstance(table,pd.DataFrame) and not table.empty:
326
  for _, r in table.iterrows():
327
- t = str(r.get("ticker","")).upper()
328
  if t in tickers:
329
- amt_map[t] = float(pd.to_numeric(r.get("amount_usd",0.0), errors="coerce") or 0.0)
330
- new_table = pd.DataFrame({"ticker": tickers, "amount_usd": [amt_map.get(t,0.0) for t in tickers]})
 
331
  if len(new_table) > MAX_TICKERS:
332
- new_table = new_table.iloc[:MAX_TICKERS]; return new_table, f"Reached max of {MAX_TICKERS}."
 
333
  return new_table, f"Added {symbol}."
334
 
335
  def lock_ticker_column(tb: Optional[pd.DataFrame]):
336
- if not isinstance(tb,pd.DataFrame) or tb.empty:
337
- return pd.DataFrame(columns=["ticker","amount_usd"])
338
  tickers = [str(x).upper() for x in tb["ticker"].tolist()]
339
  amounts = pd.to_numeric(tb["amount_usd"], errors="coerce").fillna(0.0).tolist()
340
  val = validate_tickers(tickers, years=DEFAULT_LOOKBACK_YEARS)
341
  tickers = [t for t in tickers if t in val]
342
- amounts = amounts[:len(tickers)] + [0.0]*max(0, len(tickers)-len(amounts))
343
  return pd.DataFrame({"ticker": tickers, "amount_usd": amounts})
344
 
345
- # -------------- compute core --------------
346
- UNIVERSE: List[str] = [MARKET_TICKER, "QQQ", "VTI", "SOXX", "IBIT"]
347
-
348
- def _row_to_holdings(row: pd.Series, budget: float) -> pd.DataFrame:
349
- if row is None or row.empty: return empty_holdings_df()
350
- ts = [t.strip().upper() for t in str(row["tickers"]).split(",")]
351
- ws = [float(x) for x in str(row["weights"]).split(",")]
352
- s = sum(max(0.0,w) for w in ws) or 1.0
353
- ws = [max(0.0,w)/s for w in ws]
354
- return pd.DataFrame([{"ticker": t, "weight_%": round(w*100,2), "amount_$": round(w*budget,0)} for t,w in zip(ts,ws)],
355
- columns=["ticker","weight_%","amount_$"])
356
-
357
- def compute_all(years_lookback: int, table: Optional[pd.DataFrame]):
358
- # sanitize input table
359
- df = table.copy() if isinstance(table,pd.DataFrame) else pd.DataFrame(columns=["ticker","amount_usd"])
360
  df = df.dropna(how="all")
361
  if "ticker" not in df.columns: df["ticker"] = []
362
  if "amount_usd" not in df.columns: df["amount_usd"] = []
@@ -364,54 +495,63 @@ def compute_all(years_lookback: int, table: Optional[pd.DataFrame]):
364
  df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce").fillna(0.0)
365
 
366
  symbols = [t for t in df["ticker"].tolist() if t]
367
- if len(symbols)==0: raise gr.Error("Add at least one ticker.")
368
- symbols = validate_tickers(symbols, years_lookback)
369
- if len(symbols)==0: raise gr.Error("Could not validate any tickers.")
370
 
371
- global UNIVERSE
372
- # Universe is your exact tickers (for suggestions we re-weight SAME tickers)
373
- UNIVERSE = list(sorted(set(symbols)))[:MAX_TICKERS]
374
 
375
- df = df[df["ticker"].isin(symbols)].copy()
376
- amounts = {r["ticker"]: float(r["amount_usd"]) for _, r in df.iterrows()}
377
  rf_ann = RF_ANN
378
 
 
379
  moms = estimate_all_moments_aligned(symbols, years_lookback, rf_ann)
380
- betas, cov_all_ann, erp_ann, sigma_mkt = moms["betas"], moms["cov_all_ann"], moms["erp_ann"], moms["sigma_m_ann"]
381
 
 
382
  gross = sum(abs(v) for v in amounts.values())
383
- if gross <= 1e-12: raise gr.Error("All amounts are zero.")
384
- weights = {k: v/gross for k,v in amounts.items()}
 
385
 
386
- beta_p, mu_capm, sigma_hist = portfolio_stats(weights, cov_all_ann, betas, rf_ann, erp_ann)
 
387
 
 
388
  a_sigma, b_sigma, mu_eff_sigma = efficient_same_sigma(sigma_hist, rf_ann, erp_ann, sigma_mkt)
389
  a_mu, b_mu, sigma_eff_mu = efficient_same_return(mu_capm, rf_ann, erp_ann, sigma_mkt)
390
 
391
- # dataset: re-weight EXACT same tickers you entered
392
- synth = build_synthetic_dataset_same_tickers(UNIVERSE, cov_all_ann, betas, rf_ann, erp_ann, n_rows=SYNTH_ROWS)
393
  csv_path = os.path.join(DATA_DIR, f"investor_profiles_{int(time.time())}.csv")
394
- try: synth.to_csv(csv_path, index=False)
395
- except Exception: csv_path = None
396
-
397
- # one suggestion per band (best by embedding/exposure score among candidates in band)
398
- def best_for_band(band: str) -> pd.Series:
399
- lo, hi = _band_bounds_sigma_hist(sigma_mkt, band)
400
- band_df = synth[(synth["sigma_hist"]>=lo) & (synth["sigma_hist"]<=hi)].copy()
401
- if band_df.empty: band_df = synth.copy()
402
- user_df = pd.DataFrame({"ticker": list(weights.keys()), "amount_usd": [amounts[t] for t in weights.keys()]})
403
- return pick_best_in_band(user_df, band_df, EMBED_ALPHA, top_N=50)
404
-
405
- best_low = best_for_band("Low")
406
- best_med = best_for_band("Medium")
407
- best_high = best_for_band("High")
408
-
409
- # derived displays
410
- pos_table = pd.DataFrame([{
411
- "ticker": t, "amount_usd": amounts.get(t,0.0),
412
- "weight_exposure": weights.get(t,0.0),
413
- "beta": betas.get(t, np.nan) if t != MARKET_TICKER else 1.0
414
- } for t in symbols], columns=["ticker","amount_usd","weight_exposure","beta"])
 
416
  info = "\n".join([
417
  "### Inputs",
@@ -430,85 +570,155 @@ def compute_all(years_lookback: int, table: Optional[pd.DataFrame]):
430
  f"- Same σ as your portfolio: Market {a_sigma:.2f}, Bills {b_sigma:.2f} → E[r] {mu_eff_sigma:.2%}",
431
  f"- Same E[r] as your portfolio: Market {a_mu:.2f}, Bills {b_mu:.2f} → σ {sigma_eff_mu:.2%}",
432
  "",
433
- "_Suggestions are single picks per band, re-weighting **the same tickers you entered**, and are chosen via embeddings + exposure similarity._",
434
- "_All points are guaranteed on/under the CML because σ uses the full covariance (incl. market)._"
435
  ])
436
 
437
- uni_msg = f"Universe set to: {', '.join(UNIVERSE)}"
438
- return dict(rf_ann=rf_ann, erp_ann=erp_ann, sigma_mkt=sigma_mkt,
439
- mu_capm=mu_capm, sigma_hist=sigma_hist,
440
- mu_eff_same_sigma=mu_eff_sigma, sigma_eff_same_return=sigma_eff_mu,
441
- pos_table=pos_table, info=info, uni_msg=uni_msg, csv_path=csv_path,
442
- best_low=best_low, best_med=best_med, best_high=best_high,
443
- budget=gross)
444
-
445
- def render_with_band(years_lookback: int, table: Optional[pd.DataFrame], which_band: str):
446
- outs = compute_all(years_lookback, table)
447
- # pick which suggestion to highlight
448
- row = outs["best_med"]
449
- if (which_band or "").lower().startswith("low"): row = outs["best_low"]
450
- if (which_band or "").lower().startswith("high"): row = outs["best_high"]
451
-
452
- sugg_mu = None; sugg_sigma_hist = None; holdings = empty_holdings_df()
453
- if isinstance(row, pd.Series) and not row.empty:
454
- sugg_mu = float(row["mu_capm"]); sugg_sigma_hist = float(row["sigma_hist"])
455
- holdings = _row_to_holdings(row, outs["budget"])
456
-
457
- img = plot_cml_hybrid(
458
- outs["rf_ann"], outs["erp_ann"], outs["sigma_mkt"],
459
- outs["sigma_hist"], outs["mu_capm"],
460
- outs["mu_eff_same_sigma"], outs["sigma_eff_same_return"],
461
- sugg_mu, sugg_sigma_hist
 
462
  )
463
 
464
- # small stats for each band (single pick)
465
- def _band_stats(s: pd.Series) -> str:
466
- if s is None or s.empty: return "—"
467
- return f"CAPM E[r] {float(s['mu_capm'])*100:.2f}%, σ(h) {float(s['sigma_hist'])*100:.2f}%"
468
 
469
- low_stats = _band_stats(outs["best_low"])
470
- med_stats = _band_stats(outs["best_med"])
471
- high_stats = _band_stats(outs["best_high"])
 
472
 
473
- return img, outs["info"], outs["uni_msg"], outs["pos_table"], holdings, outs["csv_path"], low_stats, med_stats, high_stats
474
 
475
  # -------------- UI --------------
476
  with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
477
  gr.Markdown(
478
  "## Efficient Portfolio Advisor\n"
479
- "Plot uses **x = historical σ** and **y = CAPM E[r] = rf + β·ERP**. "
480
- "Efficient (same σ) and (same E[r]) market/bills points are shown.\n\n"
481
- "**Suggestions:** We re-weight the **same tickers you entered** to produce one Low/Medium/High pick. "
482
- "Embeddings + exposure similarity select the best pick in each band."
483
  )
 
 
 
484
  with gr.Row():
485
  with gr.Column(scale=1):
486
- q = gr.Textbox(label="Search symbol"); search_note = gr.Markdown()
 
487
  matches = gr.Dropdown(choices=[], label="Matches")
488
- with gr.Row():
489
- search_btn = gr.Button("Search"); add_btn = gr.Button("Add selected to portfolio")
 
490
  gr.Markdown("### Portfolio positions (enter $ amounts; negatives allowed)")
491
- table = gr.Dataframe(value=pd.DataFrame(columns=["ticker","amount_usd"]), interactive=True)
492
  horizon = gr.Number(label="Horizon in years (1–100)", value=HORIZON_YEARS, precision=0)
493
- lookback = gr.Slider(1, 15, value=DEFAULT_LOOKBACK_YEARS, step=1, label="Lookback years")
494
 
495
  gr.Markdown("### Suggestions (one per band)")
496
  with gr.Row():
497
- low_btn = gr.Button("Show Low")
498
- med_btn = gr.Button("Show Medium")
499
- high_btn = gr.Button("Show High")
500
- low_txt = gr.Markdown("Low: —")
501
- med_txt = gr.Markdown("Medium: —")
502
- high_txt = gr.Markdown("High: —")
 
503
 
504
  run_btn = gr.Button("Compute (build dataset & suggest)")
505
  with gr.Column(scale=1):
506
  plot = gr.Image(label="Capital Market Line (CAPM)", type="pil")
507
  summary = gr.Markdown(label="Inputs & Results")
508
  universe_msg = gr.Textbox(label="Universe status", interactive=False)
509
- positions = gr.Dataframe(value=empty_positions_df(), interactive=False, label="Computed positions")
510
- selected_table = gr.Dataframe(value=empty_holdings_df(), interactive=False,
511
- label="Selected suggestion holdings (% / $)")
 
512
  dl = gr.File(label="Generated dataset CSV", value=None, visible=True)
513
 
514
  # wire search / add / locking / horizon
@@ -517,34 +727,41 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
517
  table.change(fn=lock_ticker_column, inputs=table, outputs=table)
518
  horizon.change(fn=set_horizon, inputs=horizon, outputs=universe_msg)
519
 
520
- # main compute (defaults to Medium highlighted)
 
521
  run_btn.click(
522
- fn=render_with_band,
523
- inputs=[lookback, table, gr.State("Medium")],
524
- outputs=[plot, summary, universe_msg, positions, selected_table, dl, low_txt, med_txt, high_txt]
525
  )
526
 
527
- # choose band with buttons
528
- low_btn.click(
529
- fn=render_with_band,
530
- inputs=[lookback, table, gr.State("Low")],
531
- outputs=[plot, summary, universe_msg, positions, selected_table, dl, low_txt, med_txt, high_txt]
532
- )
533
- med_btn.click(
534
- fn=render_with_band,
535
- inputs=[lookback, table, gr.State("Medium")],
536
- outputs=[plot, summary, universe_msg, positions, selected_table, dl, low_txt, med_txt, high_txt]
537
- )
538
- high_btn.click(
539
- fn=render_with_band,
540
- inputs=[lookback, table, gr.State("High")],
541
- outputs=[plot, summary, universe_msg, positions, selected_table, dl, low_txt, med_txt, high_txt]
542
- )
543
 
544
  # initialize risk-free at launch
545
  RF_CODE = fred_series_for_horizon(HORIZON_YEARS)
546
  RF_ANN = fetch_fred_yield_annual(RF_CODE)
547
 
548
  if __name__ == "__main__":
549
- demo.queue() # no concurrency_count to keep compatibility with older Gradio
550
- demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)), show_api=False, share=False)
 
1
+ # app.py
2
+ import os, io, math, time, warnings, json, random
3
  warnings.filterwarnings("ignore")
4
 
5
  from typing import List, Tuple, Dict, Optional
 
18
 
19
  MAX_TICKERS = 30
20
  DEFAULT_LOOKBACK_YEARS = 10
 
21
 
22
+ # Market proxy used only for CAPM/CML math. We never add it to your portfolio or suggestions.
23
+ MARKET_PROXY = "VOO"
 
 
24
 
25
+ SYNTH_ROWS = 1000 # size of generated dataset for suggestions
26
+ EMBED_ALPHA = 0.6 # exposure-sim weight in score (1-alpha uses text embeddings)
27
+ MMR_LAMBDA = 0.7 # diversity for MMR (higher favors quality over diversity)
28
+
29
+ # Globals that update with horizon changes
30
  HORIZON_YEARS = 10
31
  RF_CODE = "DGS10"
32
+ RF_ANN = 0.0375 # updated at launch
33
 
34
  # ---------------- helpers ----------------
35
  def fred_series_for_horizon(years: float) -> str:
 
45
  def fetch_fred_yield_annual(code: str) -> float:
46
  url = f"https://fred.stlouisfed.org/graph/fredgraph.csv?id={code}"
47
  try:
48
+ r = requests.get(url, timeout=10)
49
+ r.raise_for_status()
50
  df = pd.read_csv(io.StringIO(r.text))
51
  s = pd.to_numeric(df.iloc[:, 1], errors="coerce").dropna()
52
  return float(s.iloc[-1] / 100.0) if len(s) else 0.03
 
54
  return 0.03
55
 
56
  def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
57
+ tickers = list(dict.fromkeys([t.upper().strip() for t in tickers]))
58
+ start = (pd.Timestamp.today(tz="UTC") - pd.DateOffset(years=years, days=7)).date()
59
  end = pd.Timestamp.today(tz="UTC").date()
60
+
61
  df = yf.download(
62
+ tickers,
63
+ start=start,
64
+ end=end,
65
+ interval="1mo",
66
+ auto_adjust=True,
67
+ actions=False,
68
+ progress=False,
69
+ group_by="column",
70
+ threads=False,
71
  )
72
+
73
+ if isinstance(df, pd.Series):
74
+ df = df.to_frame()
75
  if isinstance(df.columns, pd.MultiIndex):
76
  lvl0 = [str(x) for x in df.columns.get_level_values(0).unique()]
77
+ if "Close" in lvl0:
78
+ df = df["Close"]
79
+ elif "Adj Close" in lvl0:
80
+ df = df["Adj Close"]
81
+ else:
82
+ df = df.xs(df.columns.levels[0][-1], axis=1, level=0, drop_level=True)
83
+
84
  cols = [c for c in tickers if c in df.columns]
85
+ out = df[cols].dropna(how="all").fillna(method="ffill")
86
+ return out
87
 
88
  def monthly_returns(prices: pd.DataFrame) -> pd.DataFrame:
89
  return prices.pct_change().dropna()
90
 
91
  def yahoo_search(query: str):
92
+ if not query or not str(query).strip():
93
+ return []
94
  url = "https://query1.finance.yahoo.com/v1/finance/search"
95
  params = {"q": query.strip(), "quotesCount": 10, "newsCount": 0}
96
  headers = {"User-Agent": "Mozilla/5.0"}
97
  try:
98
+ r = requests.get(url, params=params, headers=headers, timeout=10)
99
+ r.raise_for_status()
100
+ data = r.json()
101
+ out = []
102
  for q in data.get("quotes", []):
103
+ sym = q.get("symbol")
104
+ name = q.get("shortname") or q.get("longname") or ""
105
+ exch = q.get("exchDisp") or ""
106
+ if sym and sym.isascii():
107
+ out.append(f"{sym} | {name} | {exch}")
108
+ if not out:
109
+ out = [f"{query.strip().upper()} | typed symbol | n/a"]
110
  return out[:10]
111
  except Exception:
112
  return [f"{query.strip().upper()} | typed symbol | n/a"]
113
 
114
  def validate_tickers(symbols: List[str], years: int) -> List[str]:
115
  base = [s for s in dict.fromkeys([t.upper().strip() for t in symbols]) if s]
116
+ # We fetch base + MARKET_PROXY only to compute CAPM, but we don't add MARKET_PROXY to suggestions.
117
+ px = fetch_prices_monthly(base + [MARKET_PROXY], years)
118
  ok = [s for s in base if s in px.columns]
119
+ # Need market proxy data available; otherwise we cannot compute β/ERP/CML.
120
+ if MARKET_PROXY not in px.columns:
121
+ return [] # cannot proceed without market series
122
  return ok
123
 
124
+ # -------------- aligned moments (vs market proxy) --------------
125
  def get_aligned_monthly_returns(symbols: List[str], years: int) -> pd.DataFrame:
126
+ uniq = [c for c in dict.fromkeys(symbols) if c != MARKET_PROXY]
127
+ tickers = uniq + [MARKET_PROXY]
128
+ px = fetch_prices_monthly(tickers, years)
129
  rets = monthly_returns(px)
130
+ cols = [c for c in uniq if c in rets.columns] + ([MARKET_PROXY] if MARKET_PROXY in rets.columns else [])
131
  R = rets[cols].dropna(how="any")
132
  return R.loc[:, ~R.columns.duplicated()]
133
 
134
  def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
135
  R = get_aligned_monthly_returns(symbols, years)
136
+ if MARKET_PROXY not in R.columns or len(R) < 3:
137
  raise ValueError("Not enough aligned data with market proxy.")
138
  rf_m = rf_ann / 12.0
139
 
140
+ m = R[MARKET_PROXY]
141
+ if isinstance(m, pd.DataFrame):
142
+ m = m.iloc[:, 0].squeeze()
143
+
144
  mu_m_ann = float(m.mean() * 12.0)
145
  sigma_m_ann = float(m.std(ddof=1) * math.sqrt(12.0))
146
  erp_ann = float(mu_m_ann - rf_ann)
147
 
148
  ex_m = m - rf_m
149
+ var_m = float(np.var(ex_m.values, ddof=1))
150
+ var_m = max(var_m, 1e-9)
151
 
152
  betas: Dict[str, float] = {}
153
+ for s in [c for c in R.columns if c != MARKET_PROXY]:
154
  ex_s = R[s] - rf_m
155
  cov_sm = float(np.cov(ex_s.values, ex_m.values, ddof=1)[0, 1])
156
  betas[s] = cov_sm / var_m
 
157
 
158
+ asset_cols = [c for c in R.columns if c != MARKET_PROXY]
159
+ cov_m = np.cov(R[asset_cols].values.T, ddof=1) if asset_cols else np.zeros((0, 0))
160
+ covA = pd.DataFrame(cov_m * 12.0, index=asset_cols, columns=asset_cols)
161
 
162
+ return {"betas": betas, "cov_ann": covA, "erp_ann": erp_ann, "sigma_m_ann": sigma_m_ann}
163
 
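For reference, the β estimate above is cov(asset excess, market excess) / var(market excess), and CAPM then gives E[r] = rf + β·ERP. A small self-contained sketch with made-up monthly returns (not data from the app):

import numpy as np
rng = np.random.default_rng(0)
mkt = rng.normal(0.006, 0.04, 120)             # 120 months of fake market returns
asset = 1.3 * mkt + rng.normal(0.0, 0.02, 120) # asset with a true beta of about 1.3
rf_m = 0.03 / 12                               # monthly risk-free rate
beta = np.cov(asset - rf_m, mkt - rf_m, ddof=1)[0, 1] / np.var(mkt - rf_m, ddof=1)
erp_ann = mkt.mean() * 12 - 0.03               # annualized equity risk premium
print(beta, 0.03 + beta * erp_ann)             # CAPM E[r] = rf + beta * ERP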
164
  def capm_er(beta: float, rf_ann: float, erp_ann: float) -> float:
165
  return float(rf_ann + beta * erp_ann)
166
 
167
  def portfolio_stats(weights: Dict[str, float],
168
+ cov_ann: pd.DataFrame,
169
  betas: Dict[str, float],
170
  rf_ann: float,
171
  erp_ann: float) -> Tuple[float, float, float]:
172
  tickers = list(weights.keys())
173
  w = np.array([weights[t] for t in tickers], dtype=float)
174
  gross = float(np.sum(np.abs(w)))
175
+ if gross <= 1e-12:
176
+ return 0.0, rf_ann, 0.0
177
  w_expo = w / gross
178
  beta_p = float(np.dot([betas.get(t, 0.0) for t in tickers], w_expo))
179
  mu_capm = capm_er(beta_p, rf_ann, erp_ann)
180
+ cov = cov_ann.reindex(index=tickers, columns=tickers).fillna(0.0).to_numpy()
181
  sigma_hist = float(max(w_expo.T @ cov @ w_expo, 0.0)) ** 0.5
182
  return beta_p, mu_capm, sigma_hist
183
 
184
  def efficient_same_sigma(sigma_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
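portfolio_stats reduces to β_p = w·β and σ_p = sqrt(wᵀ Σ w) on exposure-normalized weights; a minimal standalone check with hypothetical two-asset numbers:

import numpy as np
w = np.array([0.6, 0.4])                      # exposure weights (|w| sums to 1)
betas = np.array([1.2, 0.8])
cov = np.array([[0.04, 0.01],                 # annualized covariance matrix (made up)
                [0.01, 0.02]])
beta_p = float(w @ betas)                     # 1.04
sigma_p = float(np.sqrt(w @ cov @ w))         # about 0.15
print(beta_p, sigma_p)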
185
+ if sigma_mkt <= 1e-12:
186
+ return 0.0, 1.0, rf_ann
187
  a = sigma_target / sigma_mkt
188
  return a, 1.0 - a, rf_ann + a * erp_ann
189
 
190
  def efficient_same_return(mu_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
191
+ if abs(erp_ann) <= 1e-12:
192
+ return 0.0, 1.0, rf_ann
193
  a = (mu_target - rf_ann) / erp_ann
194
  return a, 1.0 - a, abs(a) * sigma_mkt
195
 
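The two helpers above invert the CML: a fraction a in the market and (1 - a) in bills has E[r] = rf + a·ERP and σ = |a|·σ_mkt, so a = σ_target/σ_mkt for the same-σ mix and a = (μ_target - rf)/ERP for the same-return mix. Illustration with made-up inputs:

rf, erp, sigma_mkt = 0.03, 0.05, 0.15          # hypothetical CAPM inputs
a = 0.10 / sigma_mkt                           # same-sigma mix for a 10% target sigma
print(a, rf + a * erp)                         # about 0.667 in the market, E[r] about 6.33%
a = (0.07 - rf) / erp                          # same-return mix for a 7% target E[r]
print(a, abs(a) * sigma_mkt)                   # 0.8 in the market, sigma = 12%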
196
+ # -------------- plotting (CAPM on CML) --------------
197
  def _pct(x): return np.asarray(x, dtype=float) * 100.0
198
 
199
+ def _clamp_to_cml_y(mu_capm, sigma_hist, rf_ann, erp_ann, sigma_mkt):
200
+ # Return y that never exceeds CML at given (historical) sigma
201
+ slope = erp_ann / max(sigma_mkt, 1e-12)
202
+ y_cml = rf_ann + slope * max(0.0, float(sigma_hist))
203
+ return float(min(mu_capm, y_cml))
204
+
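For example, with rf = 3%, ERP = 5% and σ_mkt = 15%, a point at σ = 9% is capped at 3% + (0.05/0.15)·0.09 = 6% (usage sketch, assuming the function above is in scope):

print(_clamp_to_cml_y(0.08, 0.09, 0.03, 0.05, 0.15))  # 0.06 -> clamped onto the CML
print(_clamp_to_cml_y(0.05, 0.09, 0.03, 0.05, 0.15))  # 0.05 -> already under the CML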
205
+ def plot_cml(rf_ann, erp_ann, sigma_mkt,
206
+ sigma_hist_p, mu_capm_p,
207
+ same_sigma_mu, same_mu_sigma,
208
+ sugg_sigma_hist=None, sugg_mu_capm=None) -> Image.Image:
209
+
210
+ fig = plt.figure(figsize=(6.5, 4.3), dpi=120)
211
+
212
+ xmax = max(0.3, sigma_mkt * 2.4, (sigma_hist_p or 0.0) * 1.6, (sugg_sigma_hist or 0.0) * 1.6)
213
+ xs = np.linspace(0, xmax, 200)
214
+ cml = rf_ann + (erp_ann / max(sigma_mkt, 1e-9)) * xs
215
+
216
  plt.plot(_pct(xs), _pct(cml), label="CML (Market/Bills)", linewidth=1.8)
217
+ plt.scatter([_pct(0)], [_pct(rf_ann)], label="Risk-free")
218
+ plt.scatter([_pct(sigma_mkt)], [_pct(rf_ann + erp_ann)], label="Market")
219
+
220
+ # Your CAPM point (y clamped under CML; x = historical σ)
221
+ y_you = _clamp_to_cml_y(mu_capm_p, sigma_hist_p, rf_ann, erp_ann, sigma_mkt)
222
+ plt.scatter([_pct(sigma_hist_p)], [_pct(y_you)], label="Your CAPM point")
223
+
224
+ # Efficient market/bills mixes: same σ as your portfolio (higher E[r]) and same E[r] (lower σ)
+ plt.scatter([_pct(sigma_hist_p)], [_pct(same_sigma_mu)], label="Efficient (same σ)", marker="^")
+ plt.scatter([_pct(same_mu_sigma)], [_pct(mu_capm_p)], label="Efficient (same E[r])", marker="s")
233
+
234
+ # Selected suggestion (if any)
235
+ if sugg_sigma_hist is not None and sugg_mu_capm is not None:
236
+ y_s = _clamp_to_cml_y(sugg_mu_capm, sugg_sigma_hist, rf_ann, erp_ann, sigma_mkt)
237
+ plt.scatter([_pct(sugg_sigma_hist)], [_pct(y_s)], label="Selected Suggestion", marker="X", s=60)
238
+
239
+ plt.xlabel("σ (historical, annualized, %)")
240
+ plt.ylabel("CAPM E[r] (annual, %)")
241
+ plt.legend(loc="best", fontsize=8)
242
+ plt.tight_layout()
243
+
244
+ buf = io.BytesIO()
245
+ plt.savefig(buf, format="png")
246
+ plt.close(fig)
247
+ buf.seek(0)
248
  return Image.open(buf)
249
 
250
+ # -------------- synthetic dataset (from current universe only) --------------
251
+ def build_synthetic_dataset(universe: List[str],
252
+ covA: pd.DataFrame,
253
+ betas: Dict[str, float],
254
+ rf_ann: float,
255
+ erp_ann: float,
256
+ sigma_mkt: float,
257
+ n_rows: int = SYNTH_ROWS) -> pd.DataFrame:
258
  rng = np.random.default_rng(12345)
259
+ assets = list(universe)
260
+ if not assets:
261
+ return pd.DataFrame(columns=["tickers","weights","beta","mu_capm","sigma_hist"])
262
+
263
  rows = []
264
  for _ in range(n_rows):
265
+ k = int(rng.integers(low=min(2, len(assets)), high=min(8, len(assets)) + 1))  # guard against a one-asset universe (low must stay below high)
266
+ picks = list(rng.choice(assets, size=k, replace=False))
267
+ w = rng.dirichlet(np.ones(k)) # long-only, sum=1
268
  beta_p = float(np.dot([betas.get(t, 0.0) for t in picks], w))
269
  mu_capm = capm_er(beta_p, rf_ann, erp_ann)
270
+ sub = covA.reindex(index=picks, columns=picks).fillna(0.0).to_numpy()
271
+ sigma_hist = float(max(w.T @ sub @ w, 0.0)) ** 0.5
272
+
273
  rows.append({
274
  "tickers": ",".join(picks),
275
+ "weights": ",".join(f"{x:.8f}" for x in w),
276
  "beta": beta_p,
277
  "mu_capm": mu_capm,
278
  "sigma_hist": sigma_hist
279
  })
280
  return pd.DataFrame(rows)
281
 
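The generator draws long-only weights from a symmetric Dirichlet, so every candidate row sums to 1 by construction. The sampling step in isolation (placeholder tickers, not from the app):

import numpy as np
rng = np.random.default_rng(12345)
assets = ["AAA", "BBB", "CCC", "DDD"]          # hypothetical ticker universe
k = int(rng.integers(2, min(8, len(assets)) + 1))
picks = list(rng.choice(assets, size=k, replace=False))
w = rng.dirichlet(np.ones(k))                  # non-negative weights that sum to 1
print(picks, w.round(3), float(w.sum()))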
282
+ # ---- band helpers (by historical sigma) ----
283
  def _band_bounds_sigma_hist(sigma_mkt: float, band: str) -> Tuple[float, float]:
284
  band = (band or "Medium").strip().lower()
285
+ if band.startswith("low"):
286
+ return 0.0, 0.8 * sigma_mkt
287
+ if band.startswith("high"):
288
+ return 1.2 * sigma_mkt, 3.0 * sigma_mkt
289
  return 0.8 * sigma_mkt, 1.2 * sigma_mkt
290
 
291
+ def candidates_for_band(synth: pd.DataFrame, sigma_mkt: float, band: str):
292
+ """Return (band_df, used_fallback) ensuring Low/Medium/High are monotone in σ."""
293
+ lo, hi = _band_bounds_sigma_hist(sigma_mkt, band)
294
+ band_df = synth[(synth["sigma_hist"] >= lo) & (synth["sigma_hist"] <= hi)]
295
+ if not band_df.empty:
296
+ return band_df.copy(), False
297
+
298
+ # widen gradually
299
+ widen = 0.15
300
+ for _ in range(4):
301
+ lo = max(0.0, lo * (1.0 - widen))
302
+ hi = hi * (1.0 + widen)
303
+ band_df = synth[(synth["sigma_hist"] >= lo) & (synth["sigma_hist"] <= hi)]
304
+ if not band_df.empty:
305
+ return band_df.copy(), True
306
+
307
+ # quantile fallback
308
+ q1 = synth["sigma_hist"].quantile(0.33)
309
+ q2 = synth["sigma_hist"].quantile(0.66)
310
+ b = (band or "medium").lower()
311
+ if b.startswith("low"):
312
+ band_df = synth[synth["sigma_hist"] <= q1]
313
+ elif b.startswith("high"):
314
+ band_df = synth[synth["sigma_hist"] >= q2]
315
+ else:
316
+ band_df = synth[(synth["sigma_hist"] > q1) & (synth["sigma_hist"] < q2)]
317
+ return band_df.copy(), True
318
+
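With the default bounds above, the bands sit relative to market volatility: Low is σ ≤ 0.8·σ_mkt, Medium is 0.8–1.2·σ_mkt, High is 1.2–3·σ_mkt. For σ_mkt = 15% the cut-offs land at roughly 12% and 18% (usage sketch, assuming the helper is in scope):

sigma_mkt = 0.15
print(_band_bounds_sigma_hist(sigma_mkt, "Low"))     # (0.0, ~0.12)
print(_band_bounds_sigma_hist(sigma_mkt, "Medium"))  # (~0.12, ~0.18)
print(_band_bounds_sigma_hist(sigma_mkt, "High"))    # (~0.18, ~0.45)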
319
+ # -------------- Embeddings & scoring (always on; fail gracefully) --------------
320
+ def _load_st_model():
321
  try:
322
  from sentence_transformers import SentenceTransformer
323
+ return SentenceTransformer("FinLang/finance-embeddings-investopedia")
324
  except Exception:
325
+ return None
326
 
327
+ def _encode(model, texts: List[str]) -> np.ndarray:
328
+ if model is None:
329
+ # Offline fallback: deterministic random vectors (fixed seed), not derived from the text content
330
+ rng = np.random.default_rng(42)
331
+ return rng.normal(size=(len(texts), 384)).astype(np.float32)
332
+ vecs = model.encode(texts, show_progress_bar=False, normalize_embeddings=True)
333
+ return np.asarray(vecs, dtype=np.float32)
334
 
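Note that _load_st_model() builds a fresh SentenceTransformer on every call, and _portfolio_embedding below calls it once per candidate row. A possible memoized variant (a sketch under a hypothetical name, not part of the commit):

from functools import lru_cache

@lru_cache(maxsize=1)
def _load_st_model_cached():
    # Same behavior as _load_st_model above, but the model is constructed only once per process.
    try:
        from sentence_transformers import SentenceTransformer
        return SentenceTransformer("FinLang/finance-embeddings-investopedia")
    except Exception:
        return None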
335
  def _portfolio_embedding(tickers: List[str], weights: List[float]) -> np.ndarray:
336
+ model = _load_st_model()
337
+ texts = [f"ticker {t}" for t in tickers]
338
+ embs = _encode(model, texts)
339
+ w = np.asarray(weights, dtype=float)
340
+ s = np.sum(np.abs(w)) or 1.0
341
+ w = np.abs(w) / s
342
+ v = (embs * w[:, None]).sum(axis=0, keepdims=False)
343
+ n = np.linalg.norm(v) or 1.0
344
+ return (v / n).astype(np.float32)
345
 
346
  def _cos_sim(a: np.ndarray, b: np.ndarray) -> float:
347
+ da = float(np.linalg.norm(a)); db = float(np.linalg.norm(b))
348
+ if da <= 1e-12 or db <= 1e-12: return 0.0
349
+ return float(np.dot(a, b) / (da * db))
350
+
351
+ def _exposure_similarity(u: Dict[str, float], c: Dict[str, float]) -> float:
352
+ # similarity = 1 - 0.5 * L1 distance between normalized |weights| over the union of tickers; result in [0, 1]
353
+ keys = sorted(set(u.keys()) | set(c.keys()))
354
+ uvec = np.array([u.get(k, 0.0) for k in keys]); uvec = np.abs(uvec) / (np.sum(np.abs(uvec)) or 1.0)
355
+ cvec = np.array([c.get(k, 0.0) for k in keys]); cvec = np.abs(cvec) / (np.sum(np.abs(cvec)) or 1.0)
356
+ dist = float(np.sum(np.abs(uvec - cvec)))
357
+ return float(max(0.0, 1.0 - 0.5 * dist))
358
+
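The exposure score is scale-invariant and lands in [0, 1]: identical normalized exposures score 1, disjoint ones score 0. A quick check with hypothetical weight maps (assuming the function above is in scope):

print(_exposure_similarity({"QQQ": 1.0}, {"QQQ": 2.0}))              # 1.0 (scale-invariant)
print(_exposure_similarity({"QQQ": 1.0}, {"VTI": 1.0}))              # 0.0 (no overlap)
print(_exposure_similarity({"QQQ": 0.5, "VTI": 0.5}, {"QQQ": 1.0}))  # 0.5 (half overlap)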
359
+ def _mmr_select(rows: pd.DataFrame, scores: np.ndarray, topk: int, lam: float) -> List[int]:
360
+ if len(rows) <= topk:
361
+ return list(range(len(rows)))
362
+ chosen = []
363
+ cand = list(range(len(rows)))
364
+ sims_cache = {}
365
+ # precompute embeddings of candidates for diversity
366
+ embs = []
367
+ for _, r in rows.iterrows():
368
+ ts = [t.strip().upper() for t in str(r["tickers"]).split(",")]
369
+ ws = [float(x) for x in str(r["weights"]).split(",")]
370
+ embs.append(_portfolio_embedding(ts, ws))
371
+ embs = np.stack(embs, axis=0)
372
+
373
+ while len(chosen) < topk and cand:
374
+ # pick argmax of lam*score - (1-lam)*max_sim_to_chosen
375
+ best_i = None; best_val = -1e9
376
+ for i in cand:
377
+ if not chosen:
378
+ val = float(scores[i])
379
+ else:
380
+ max_sim = max(_cos_sim(embs[i], embs[j]) for j in chosen)
381
+ val = lam * float(scores[i]) - (1.0 - lam) * float(max_sim)
382
+ if val > best_val:
383
+ best_val, best_i = val, i
384
+ chosen.append(best_i)
385
+ cand.remove(best_i)
386
+ return chosen
387
+
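The MMR step trades candidate quality against similarity to rows already chosen, value = λ·score - (1 - λ)·max_sim, with λ = MMR_LAMBDA = 0.7. A tiny arithmetic illustration with made-up scores:

lam = 0.7                                 # MMR_LAMBDA
score_b, sim_b = 0.90, 0.95               # strong candidate, nearly identical to one already chosen
score_c, sim_c = 0.80, 0.20               # slightly weaker but diverse candidate
print(lam * score_b - (1 - lam) * sim_b)  # about 0.345
print(lam * score_c - (1 - lam) * sim_c)  # about 0.50 -> the diverse candidate wins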
388
+ def pick_best_in_band(user_df: pd.DataFrame,
389
+ band_df: pd.DataFrame,
390
+ alpha: float = EMBED_ALPHA,
391
+ top_N: int = 50) -> pd.Series:
392
+ if band_df.empty:
393
+ return pd.Series(dtype="float64")
394
  try:
 
395
  band_df = band_df.sort_values("mu_capm", ascending=False).head(top_N).reset_index(drop=True)
396
 
397
  u_t = user_df["ticker"].astype(str).str.upper().tolist()
 
403
  for _, r in band_df.iterrows():
404
  ts = [t.strip().upper() for t in str(r["tickers"]).split(",")]
405
  ws = [float(x) for x in str(r["weights"]).split(",")]
406
+ s = sum(max(0.0, w) for w in ws) or 1.0
407
+ ws = [max(0.0, w) / s for w in ws]
408
+ c_map = {t: w for t, w in zip(ts, ws)}
409
  c_embed = _portfolio_embedding(ts, ws)
410
  expo_sim = _exposure_similarity(u_map, c_map)
411
  emb_sim = _cos_sim(u_embed, c_embed)
412
+ scores.append(alpha * expo_sim + (1.0 - alpha) * emb_sim)
413
 
414
+ # Take the best after MMR top-3 selection (but return only #1)
415
+ top_idxs = _mmr_select(band_df, np.asarray(scores), topk=3, lam=MMR_LAMBDA)
416
+ best_idx = top_idxs[0]
417
+ return band_df.iloc[best_idx]
418
  except Exception:
419
  return band_df.iloc[0]
420
 
421
  # -------------- UI helpers --------------
422
+ def empty_positions_df():
423
+ return pd.DataFrame(columns=["ticker", "amount_usd", "weight_exposure", "beta"])
424
+
425
+ def empty_suggestion_df():
426
+ return pd.DataFrame(columns=["ticker", "weight_%", "amount_$"])
427
 
428
  def set_horizon(years: float):
429
+ y = max(1.0, min(100.0, float(years)))
430
+ code = fred_series_for_horizon(y)
431
+ rf = fetch_fred_yield_annual(code)
432
  global HORIZON_YEARS, RF_CODE, RF_ANN
433
+ HORIZON_YEARS = y
434
+ RF_CODE = code
435
+ RF_ANN = rf
436
  return f"Risk-free series {code}. Latest annual rate {rf:.2%}."
437
 
438
  def search_tickers_cb(q: str):
 
442
 
443
  def add_symbol(selection: str, table: Optional[pd.DataFrame]):
444
  if not selection:
445
+ return table if isinstance(table, pd.DataFrame) else pd.DataFrame(columns=["ticker","amount_usd"]), "Pick a row in Matches first."
446
  symbol = selection.split("|")[0].strip().upper()
447
+
448
  current = []
449
+ if isinstance(table, pd.DataFrame) and not table.empty:
450
  current = [str(x).upper() for x in table["ticker"].tolist() if str(x) != "nan"]
451
  tickers = current if symbol in current else current + [symbol]
452
+
453
+ # do NOT auto-add MARKET_PROXY; validate uses it only for data fetch
454
  val = validate_tickers(tickers, years=DEFAULT_LOOKBACK_YEARS)
455
  tickers = [t for t in tickers if t in val]
456
+
457
  amt_map = {}
458
+ if isinstance(table, pd.DataFrame) and not table.empty:
459
  for _, r in table.iterrows():
460
+ t = str(r.get("ticker", "")).upper()
461
  if t in tickers:
462
+ amt_map[t] = float(pd.to_numeric(r.get("amount_usd", 0.0), errors="coerce") or 0.0)
463
+
464
+ new_table = pd.DataFrame({"ticker": tickers, "amount_usd": [amt_map.get(t, 0.0) for t in tickers]})
465
  if len(new_table) > MAX_TICKERS:
466
+ new_table = new_table.iloc[:MAX_TICKERS]
467
+ return new_table, f"Reached max of {MAX_TICKERS}."
468
  return new_table, f"Added {symbol}."
469
 
470
  def lock_ticker_column(tb: Optional[pd.DataFrame]):
471
+ if not isinstance(tb, pd.DataFrame) or tb.empty:
472
+ return pd.DataFrame(columns=["ticker", "amount_usd"])
473
  tickers = [str(x).upper() for x in tb["ticker"].tolist()]
474
  amounts = pd.to_numeric(tb["amount_usd"], errors="coerce").fillna(0.0).tolist()
475
  val = validate_tickers(tickers, years=DEFAULT_LOOKBACK_YEARS)
476
  tickers = [t for t in tickers if t in val]
477
+ amounts = amounts[:len(tickers)] + [0.0] * max(0, len(tickers) - len(amounts))
478
  return pd.DataFrame({"ticker": tickers, "amount_usd": amounts})
479
 
480
+ # ---- compute all once; then we switch the displayed band without recomputing ----
481
+ def compute_all(
482
+ years_lookback: int,
483
+ table: Optional[pd.DataFrame],
484
+ risk_horizon_years: float
485
+ ):
486
+ # sanitize table
487
+ if isinstance(table, pd.DataFrame):
488
+ df = table.copy()
489
+ else:
490
+ df = pd.DataFrame(columns=["ticker", "amount_usd"])
 
 
 
 
491
  df = df.dropna(how="all")
492
  if "ticker" not in df.columns: df["ticker"] = []
493
  if "amount_usd" not in df.columns: df["amount_usd"] = []
 
495
  df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce").fillna(0.0)
496
 
497
  symbols = [t for t in df["ticker"].tolist() if t]
498
+ if len(symbols) == 0:
499
+ return {"error": "Add at least one ticker."}
 
500
 
501
+ symbols = validate_tickers(symbols, years_lookback)
502
+ if len(symbols) == 0:
503
+ return {"error": f"Could not validate any tickers (also need market data for {MARKET_PROXY})."}
504
 
505
+ amounts = {t: float(df[df["ticker"] == t]["amount_usd"].iloc[0]) for t in symbols}
 
506
  rf_ann = RF_ANN
507
 
508
+ # Moments vs market proxy
509
  moms = estimate_all_moments_aligned(symbols, years_lookback, rf_ann)
510
+ betas, covA, erp_ann, sigma_mkt = moms["betas"], moms["cov_ann"], moms["erp_ann"], moms["sigma_m_ann"]
511
 
512
+ # Weights (exposures)
513
  gross = sum(abs(v) for v in amounts.values())
514
+ if gross <= 1e-12:
515
+ return {"error": "All amounts are zero."}
516
+ weights = {k: v / gross for k, v in amounts.items()}
517
 
518
+ # Portfolio CAPM stats
519
+ beta_p, mu_capm, sigma_hist = portfolio_stats(weights, covA, betas, rf_ann, erp_ann)
520
 
521
+ # Efficient alternatives (market/bills)
522
  a_sigma, b_sigma, mu_eff_sigma = efficient_same_sigma(sigma_hist, rf_ann, erp_ann, sigma_mkt)
523
  a_mu, b_mu, sigma_eff_mu = efficient_same_return(mu_capm, rf_ann, erp_ann, sigma_mkt)
524
 
525
+ # Synthetic dataset & suggestions (universe = user's tickers only)
526
+ synth = build_synthetic_dataset(symbols, covA, betas, rf_ann, erp_ann, sigma_mkt, n_rows=SYNTH_ROWS)
527
  csv_path = os.path.join(DATA_DIR, f"investor_profiles_{int(time.time())}.csv")
528
+ try:
529
+ synth.to_csv(csv_path, index=False)
530
+ except Exception:
531
+ csv_path = None
532
+
533
+ # one suggestion per band
534
+ def best_for_band(band: str):
535
+ band_df, used_fallback = candidates_for_band(synth, sigma_mkt, band)
536
+ user_df = pd.DataFrame({"ticker": list(weights.keys()),
537
+ "amount_usd": [amounts[t] for t in weights.keys()]})
538
+ row = pick_best_in_band(user_df, band_df, EMBED_ALPHA, top_N=50)
539
+ return row, used_fallback
540
+
541
+ best_low, low_fb = best_for_band("Low")
542
+ best_med, med_fb = best_for_band("Medium")
543
+ best_high, high_fb = best_for_band("High")
544
+
545
+ # positions table
546
+ pos_table = pd.DataFrame(
547
+ [{
548
+ "ticker": t,
549
+ "amount_usd": amounts.get(t, 0.0),
550
+ "weight_exposure": weights.get(t, 0.0),
551
+ "beta": betas.get(t, np.nan)
552
+ } for t in symbols],
553
+ columns=["ticker", "amount_usd", "weight_exposure", "beta"]
554
+ )
555
 
556
  info = "\n".join([
557
  "### Inputs",
 
570
  f"- Same σ as your portfolio: Market {a_sigma:.2f}, Bills {b_sigma:.2f} → E[r] {mu_eff_sigma:.2%}",
571
  f"- Same E[r] as your portfolio: Market {a_mu:.2f}, Bills {b_mu:.2f} → σ {sigma_eff_mu:.2%}",
572
  "",
573
+ "_All plotted points are on/under the CML; if CAPM E[r] exceeds the CML at a given σ, we clamp to CML for visualization._"
 
574
  ])
575
 
576
+ outs = dict(
577
+ ok=True,
578
+ rf_ann=rf_ann, erp_ann=erp_ann, sigma_mkt=sigma_mkt,
579
+ sigma_hist=sigma_hist, mu_capm=mu_capm,
580
+ same_sigma_mu=mu_eff_sigma, same_mu_sigma=sigma_eff_mu,
581
+ positions=pos_table, csv_path=csv_path, symbols=symbols,
582
+ amounts=amounts, weights=weights,
583
+ best_low=best_low, best_med=best_med, best_high=best_high,
584
+ low_fb=low_fb, med_fb=med_fb, high_fb=high_fb,
585
+ budget=gross
586
+ )
587
+ return outs
588
+
589
+ def _row_to_table(row: pd.Series, budget: float) -> pd.DataFrame:
590
+ if row is None or row.empty:
591
+ return empty_suggestion_df()
592
+ ts = [t.strip().upper() for t in str(row["tickers"]).split(",")]
593
+ ws = [float(x) for x in str(row["weights"]).split(",")]
594
+ s = sum(max(0.0, w) for w in ws) or 1.0
595
+ ws = [max(0.0, w) / s for w in ws]
596
+ return pd.DataFrame(
597
+ [{"ticker": t, "weight_%": round(w*100.0, 2), "amount_$": round(w*budget, 0)} for t, w in zip(ts, ws)],
598
+ columns=["ticker", "weight_%", "amount_$"]
599
+ )
600
+
601
+ def _band_stats(label: str, s: pd.Series, used_fallback: bool) -> str:
602
+ if s is None or s.empty:
603
+ return f"**{label}:** —"
604
+ tag = " *(fallback)*" if used_fallback else ""
605
+ return (f"**{label}:** CAPM E[r] {float(s['mu_capm'])*100:.2f}%, "
606
+ f"σ(h) {float(s['sigma_hist'])*100:.2f}%{tag}")
607
+
608
+ def render_with_band(outs: dict, band: str):
609
+ if not outs.get("ok", False):
610
+ msg = outs.get("error", "Unknown error.")
611
+ return None, msg, msg, empty_positions_df(), empty_suggestion_df(), None, "—", "—", "—"
612
+
613
+ rf_ann, erp_ann, sigma_mkt = outs["rf_ann"], outs["erp_ann"], outs["sigma_mkt"]
614
+ sigma_hist, mu_capm = outs["sigma_hist"], outs["mu_capm"]
615
+ same_sigma_mu, same_mu_sigma = outs["same_sigma_mu"], outs["same_mu_sigma"]
616
+
617
+ pick = outs["best_low"] if band == "Low" else outs["best_high"] if band == "High" else outs["best_med"]
618
+ sugg_sigma = float(pick["sigma_hist"]) if (pick is not None and not pick.empty) else None
619
+ sugg_mu = float(pick["mu_capm"]) if (pick is not None and not pick.empty) else None
620
+
621
+ img = plot_cml(
622
+ rf_ann, erp_ann, sigma_mkt,
623
+ sigma_hist, mu_capm,
624
+ same_sigma_mu, same_mu_sigma,
625
+ sugg_sigma_hist=sugg_sigma, sugg_mu_capm=sugg_mu
626
  )
627
 
628
+ low_stats = _band_stats("Low", outs["best_low"], outs["low_fb"])
629
+ med_stats = _band_stats("Medium", outs["best_med"], outs["med_fb"])
630
+ high_stats = _band_stats("High", outs["best_high"], outs["high_fb"])
 
631
 
632
+ sugg_table = _row_to_table(pick, outs["budget"])
633
+ positions = outs["positions"]
634
+ csv_path = outs["csv_path"]
635
+
636
+ # We also show universe status as text
637
+ uni_msg = f"Universe set to: {', '.join(outs['symbols'])}"
638
639
+
640
+ # Build the summary text shown in the right-hand panel
641
+ info_lines = [
642
+ "### Inputs",
643
+ f"- Lookback years {int(DEFAULT_LOOKBACK_YEARS)}",
644
+ f"- Horizon years {int(round(HORIZON_YEARS))}",
645
+ f"- Risk-free {rf_ann:.2%} from {RF_CODE}",
646
+ f"- Market ERP {erp_ann:.2%}",
647
+ f"- Market σ (hist) {sigma_mkt:.2%}",
648
+ "",
649
+ "### Your portfolio (CAPM on CML; x=σ_hist, y=CAPM E[r])",
650
+ f"- CAPM E[r] {mu_capm:.2%}",
651
+ f"- σ (historical) {sigma_hist:.2%}",
652
+ "",
653
+ "### Efficient market/bills mixes",
654
+ f"- Same σ: E[r] {same_sigma_mu:.2%}",
655
+ f"- Same E[r]: σ {same_mu_sigma:.2%}",
656
+ ]
657
+ info = "\n".join(info_lines)
658
 
659
+ return img, info, uni_msg, positions, sugg_table, csv_path, low_stats, med_stats, high_stats
660
 
661
  # -------------- UI --------------
662
  with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
663
  gr.Markdown(
664
  "## Efficient Portfolio Advisor\n"
665
+ "Enter **$ amounts** for your tickers (negatives allowed), set horizon. "
666
+ "Plot shows your **CAPM point on the CML** using historical σ on the x-axis. "
667
+ "Suggestions are generated from your tickers only; embeddings + MMR are always on."
 
668
  )
669
+
670
+ state = gr.State(value=None) # stores compute_all outputs
671
+
672
  with gr.Row():
673
  with gr.Column(scale=1):
674
+ q = gr.Textbox(label="Search symbol")
675
+ search_note = gr.Markdown()
676
  matches = gr.Dropdown(choices=[], label="Matches")
677
+ search_btn = gr.Button("Search")
678
+ add_btn = gr.Button("Add selected to portfolio")
679
+
680
  gr.Markdown("### Portfolio positions (enter $ amounts; negatives allowed)")
681
+ table = gr.Dataframe(
682
+ headers=["ticker", "amount_usd"],
683
+ datatype=["str", "number"],
684
+ row_count=0,
685
+ col_count=(2, "fixed")
686
+ )
687
+
688
  horizon = gr.Number(label="Horizon in years (1–100)", value=HORIZON_YEARS, precision=0)
689
+ lookback = gr.Slider(1, 15, value=DEFAULT_LOOKBACK_YEARS, step=1, label="Lookback years for betas & covariances")
690
 
691
  gr.Markdown("### Suggestions (one per band)")
692
  with gr.Row():
693
+ btn_low = gr.Button("Show Low")
694
+ btn_med = gr.Button("Show Medium")
695
+ btn_high = gr.Button("Show High")
696
+
697
+ low_line = gr.Markdown(value="**Low:** —")
698
+ med_line = gr.Markdown(value="**Medium:** —")
699
+ high_line = gr.Markdown(value="**High:** —")
700
 
701
  run_btn = gr.Button("Compute (build dataset & suggest)")
702
  with gr.Column(scale=1):
703
  plot = gr.Image(label="Capital Market Line (CAPM)", type="pil")
704
  summary = gr.Markdown(label="Inputs & Results")
705
  universe_msg = gr.Textbox(label="Universe status", interactive=False)
706
+ positions = gr.Dataframe(
707
+ label="Computed positions",
708
+ headers=["ticker", "amount_usd", "weight_exposure", "beta"],
709
+ datatype=["str", "number", "number", "number"],
710
+ col_count=(4, "fixed"),
711
+ value=empty_positions_df(),
712
+ interactive=False
713
+ )
714
+ sugg_table = gr.Dataframe(
715
+ label="Selected suggestion — holdings shown in % and $ (from *your* tickers only)",
716
+ headers=["ticker", "weight_%", "amount_$"],
717
+ datatype=["str", "number", "number"],
718
+ col_count=(3, "fixed"),
719
+ value=empty_suggestion_df(),
720
+ interactive=False
721
+ )
722
  dl = gr.File(label="Generated dataset CSV", value=None, visible=True)
723
 
724
  # wire search / add / locking / horizon
 
727
  table.change(fn=lock_ticker_column, inputs=table, outputs=table)
728
  horizon.change(fn=set_horizon, inputs=horizon, outputs=universe_msg)
729
 
730
+ # main compute
731
+ def _compute_and_show(lookback_v, table_v, horizon_v):
732
+ outs = compute_all(int(lookback_v), table_v, float(horizon_v))
733
+ if not outs.get("ok", False):
734
+ err = outs.get("error", "Unable to compute.")
735
+ # return blank UI + error in summary
736
+ return (outs, None, f"**Error:** {err}", err,
737
+ empty_positions_df(), empty_suggestion_df(), None, "—","—","—")
738
+ # default show Medium
739
+ img, info, uni_msg, pos, st, csv_path, low_s, med_s, high_s = render_with_band(outs, "Medium")
740
+ return (outs, img, info, uni_msg, pos, st, csv_path, low_s, med_s, high_s)
741
+
742
  run_btn.click(
743
+ fn=_compute_and_show,
744
+ inputs=[lookback, table, horizon],
745
+ outputs=[state, plot, summary, universe_msg, positions, sugg_table, dl, low_line, med_line, high_line]
746
  )
747
 
748
+ # band buttons (no recompute; reuse state)
749
+ def _show_band(outs, band):
750
+ if outs is None:
751
+ return None, "Click Compute first.", "", empty_positions_df(), empty_suggestion_df(), None, "**Low:** —", "**Medium:** —", "**High:** —"
752
+ return render_with_band(outs, band)
753
+
754
+ btn_low.click(fn=_show_band, inputs=[state, gr.State("Low")],
755
+ outputs=[plot, summary, universe_msg, positions, sugg_table, dl, low_line, med_line, high_line])
756
+ btn_med.click(fn=_show_band, inputs=[state, gr.State("Medium")],
757
+ outputs=[plot, summary, universe_msg, positions, sugg_table, dl, low_line, med_line, high_line])
758
+ btn_high.click(fn=_show_band, inputs=[state, gr.State("High")],
759
+ outputs=[plot, summary, universe_msg, positions, sugg_table, dl, low_line, med_line, high_line])
760
 
761
  # initialize risk-free at launch
762
  RF_CODE = fred_series_for_horizon(HORIZON_YEARS)
763
  RF_ANN = fetch_fred_yield_annual(RF_CODE)
764
 
765
  if __name__ == "__main__":
766
+ # No concurrency_count here (Gradio 5); let the platform set host/port
767
+ demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)), share=False)