Tulitula commited on
Commit
d020540
·
verified ·
1 Parent(s): bffd7d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +334 -321
app.py CHANGED
@@ -1,6 +1,13 @@
1
-
2
  # app.py
3
- import os, io, math, json, warnings, hashlib, random
 
 
 
 
 
 
 
 
4
  warnings.filterwarnings("ignore")
5
 
6
  from typing import List, Tuple, Dict, Optional
@@ -8,26 +15,24 @@ from typing import List, Tuple, Dict, Optional
8
  import numpy as np
9
  import pandas as pd
10
  import matplotlib.pyplot as plt
11
- import gradio as gr
12
  from PIL import Image
13
  import requests
 
14
  import yfinance as yf
15
 
16
- from sklearn.neighbors import KNeighborsRegressor
17
- from sklearn.preprocessing import StandardScaler
18
 
19
- # ---------------- config ----------------
20
  DATA_DIR = "data"
21
  os.makedirs(DATA_DIR, exist_ok=True)
22
 
 
23
  MAX_TICKERS = 30
24
  DEFAULT_LOOKBACK_YEARS = 10
25
- MARKET_TICKER = "VOO" # fall back to SPY if needed
26
 
27
- # UI tables
28
- POS_COLS = ["ticker", "amount_usd", "weight_exposure", "beta"]
29
-
30
- # FRED curve mapping: horizon -> series code
31
  FRED_MAP = [
32
  (1, "DGS1"),
33
  (2, "DGS2"),
@@ -40,7 +45,25 @@ FRED_MAP = [
40
  (100, "DGS30"),
41
  ]
42
 
43
- # ---------------- helpers ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  def fred_series_for_horizon(years: float) -> str:
45
  y = max(1.0, min(100.0, float(years)))
46
  for cutoff, code in FRED_MAP:
@@ -59,31 +82,61 @@ def fetch_fred_yield_annual(code: str) -> float:
59
  except Exception:
60
  return 0.03
61
 
 
62
  def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
63
  start = pd.Timestamp.today(tz="UTC") - pd.DateOffset(years=years, days=7)
64
  end = pd.Timestamp.today(tz="UTC")
65
- df = yf.download(
 
66
  list(dict.fromkeys(tickers)),
67
  start=start.date(),
68
  end=end.date(),
69
  interval="1mo",
70
- auto_adjust=True,
71
  progress=False,
72
  group_by="ticker",
73
- )["Close"]
74
- if isinstance(df, pd.Series):
75
- df = df.to_frame()
76
- df = df.dropna(how="all").fillna(method="ffill")
77
- # If yfinance returns MultiIndex columns for multiple tickers, flatten
78
- if isinstance(df.columns, pd.MultiIndex):
79
- df.columns = [c[0] for c in df.columns]
80
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  def monthly_returns(prices: pd.DataFrame) -> pd.DataFrame:
83
  return prices.pct_change().dropna()
84
 
 
 
 
 
 
 
 
85
  def yahoo_search(query: str):
86
- if not query or len(query.strip()) == 0:
87
  return []
88
  url = "https://query1.finance.yahoo.com/v1/finance/search"
89
  params = {"q": query.strip(), "quotesCount": 10, "newsCount": 0}
@@ -100,52 +153,35 @@ def yahoo_search(query: str):
100
  if sym and sym.isascii():
101
  out.append({"symbol": sym, "name": name, "exchange": exch})
102
  if not out:
103
- out = [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": "n/a"}]
104
  return out[:10]
105
  except Exception:
106
- return [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": "n/a"}]
107
 
108
  def validate_tickers(symbols: List[str], years: int) -> List[str]:
109
- if not symbols:
110
- return []
111
- # Always include market proxy so alignment works
112
- base = [s for s in dict.fromkeys(symbols)]
113
  px = fetch_prices_monthly(base + [MARKET_TICKER], years)
114
  ok = [s for s in base if s in px.columns]
115
- # If market ticker missing, try SPY as fallback
116
- if MARKET_TICKER not in px.columns and "SPY" not in px.columns:
117
- # Try once more with SPY added
118
- px2 = fetch_prices_monthly(base + ["SPY"], years)
119
- ok = [s for s in base if s in px2.columns]
120
  return ok
121
 
122
- # -------------- aligned moments --------------
123
  def get_aligned_monthly_returns(symbols: List[str], years: int) -> pd.DataFrame:
124
- uniq = [c for c in dict.fromkeys(symbols) if c]
125
  tickers = uniq + [MARKET_TICKER]
126
  px = fetch_prices_monthly(tickers, years)
127
- # if VOO missing, try SPY as market
128
- mkt = MARKET_TICKER if MARKET_TICKER in px.columns else ("SPY" if "SPY" in px.columns else None)
129
- if mkt is None:
130
- return pd.DataFrame()
131
  rets = monthly_returns(px)
132
- cols = [c for c in uniq if c in rets.columns] + [mkt]
133
  R = rets[cols].dropna(how="any")
134
- return R, mkt
135
-
136
- def annualize_mean(m):
137
- return np.asarray(m, dtype=float) * 12.0
138
-
139
- def annualize_sigma(s):
140
- return np.asarray(s, dtype=float) * math.sqrt(12.0)
141
 
142
  def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
143
- R, mkt = get_aligned_monthly_returns(symbols, years)
144
- if R is None or R.empty or mkt is None or R.shape[0] < 3:
145
- raise ValueError("Not enough aligned data for selected tickers / lookback.")
146
  rf_m = rf_ann / 12.0
147
 
148
- m = R[mkt]
149
  if isinstance(m, pd.DataFrame):
150
  m = m.iloc[:, 0].squeeze()
151
 
@@ -158,17 +194,16 @@ def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
158
  var_m = max(var_m, 1e-6)
159
 
160
  betas: Dict[str, float] = {}
161
- for s in [c for c in R.columns if c != mkt]:
162
  ex_s = R[s] - rf_m
163
  betas[s] = float(np.cov(ex_s.values, ex_m.values, ddof=1)[0, 1] / var_m)
 
164
 
165
- betas[mkt] = 1.0
166
- # asset covariance (annualized) excluding market column
167
- asset_cols = [c for c in R.columns if c != mkt]
168
  cov_m = np.cov(R[asset_cols].values.T, ddof=1) if asset_cols else np.zeros((0, 0))
169
  covA = pd.DataFrame(cov_m * 12.0, index=asset_cols, columns=asset_cols)
170
 
171
- return {"betas": betas, "cov_ann": covA, "erp_ann": erp_ann, "sigma_m_ann": sigma_m_ann, "mkt": mkt}
172
 
173
  def capm_er(beta: float, rf_ann: float, erp_ann: float) -> float:
174
  return float(rf_ann + beta * erp_ann)
@@ -179,19 +214,20 @@ def portfolio_stats(weights: Dict[str, float],
179
  rf_ann: float,
180
  erp_ann: float) -> Tuple[float, float, float]:
181
  tickers = list(weights.keys())
 
 
182
  w = np.array([weights[t] for t in tickers], dtype=float)
183
  gross = float(np.sum(np.abs(w)))
184
- if gross == 0:
185
  return 0.0, 0.0, 0.0
186
  w_expo = w / gross
187
  beta_p = float(np.dot([betas.get(t, 0.0) for t in tickers], w_expo))
188
  er_p = capm_er(beta_p, rf_ann, erp_ann)
189
  cov = cov_ann.reindex(index=tickers, columns=tickers).fillna(0.0).to_numpy()
190
- v = float(w_expo.T @ cov @ w_expo)
191
- sigma_p = math.sqrt(max(v, 0.0))
192
  return beta_p, er_p, sigma_p
193
 
194
- # -------------- CML helpers --------------
195
  def efficient_same_sigma(sigma_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
196
  if sigma_mkt <= 1e-12:
197
  return 0.0, 1.0, rf_ann
@@ -204,55 +240,43 @@ def efficient_same_return(mu_target: float, rf_ann: float, erp_ann: float, sigma
204
  a = (mu_target - rf_ann) / erp_ann
205
  return a, 1.0 - a, abs(a) * sigma_mkt
206
 
207
- def plot_cml(
208
  rf_ann, erp_ann, sigma_mkt,
209
  pt_sigma, pt_mu,
210
  same_sigma_sigma, same_sigma_mu,
211
  same_mu_sigma, same_mu_mu,
212
- sugg_sigma=None, sugg_mu=None
213
  ) -> Image.Image:
214
- fig = plt.figure(figsize=(6.2, 4.2), dpi=120)
215
 
216
  xmax = max(
217
- 0.30,
218
  sigma_mkt * 2.0,
219
  pt_sigma * 1.4,
220
  same_mu_sigma * 1.4,
221
  same_sigma_sigma * 1.4,
222
- (sugg_sigma or 0.0) * 1.4,
223
  )
224
  xs = np.linspace(0, xmax, 160)
225
  slope = erp_ann / max(sigma_mkt, 1e-12)
226
  cml = rf_ann + slope * xs
227
- plt.plot(xs * 100.0, cml * 100.0, label="CML via Market")
228
-
229
- # key points
230
- plt.scatter([0.0], [rf_ann * 100.0], label="Risk-free (FRED)")
231
- plt.scatter([sigma_mkt * 100.0], [(rf_ann + erp_ann) * 100.0], label="Market (VOO)")
232
- plt.scatter([pt_sigma * 100.0], [pt_mu * 100.0], label="Your portfolio")
233
-
234
- plt.scatter([same_sigma_sigma * 100.0], [same_sigma_mu * 100.0], label="Efficient same sigma")
235
- plt.scatter([same_mu_sigma * 100.0], [same_mu_mu * 100.0], label="Efficient same return")
236
-
237
- if sugg_sigma is not None and sugg_mu is not None:
238
- plt.scatter([sugg_sigma * 100.0], [sugg_mu * 100.0], label="Suggestion")
239
-
240
- # simple guides
241
- plt.plot(
242
- [pt_sigma * 100.0, same_sigma_sigma * 100.0],
243
- [pt_mu * 100.0, same_sigma_mu * 100.0],
244
- linestyle="--", linewidth=1.1, alpha=0.7, color="gray",
245
- )
246
- plt.plot(
247
- [pt_sigma * 100.0, same_mu_sigma * 100.0],
248
- [pt_mu * 100.0, same_mu_mu * 100.0],
249
- linestyle="--", linewidth=1.1, alpha=0.7, color="gray",
250
- )
251
-
252
- plt.xlabel("σ (annualized)")
253
- plt.ylabel("Expected return (annual)")
254
- plt.gca().xaxis.set_major_formatter(lambda v, pos: f"{v:.0f}%")
255
- plt.gca().yaxis.set_major_formatter(lambda v, pos: f"{v:.0f}%")
256
  plt.legend(loc="best", fontsize=8)
257
  plt.tight_layout()
258
 
@@ -262,10 +286,10 @@ def plot_cml(
262
  buf.seek(0)
263
  return Image.open(buf)
264
 
265
- # -------------- synthetic dataset --------------
266
- def _row_to_exposures(row: pd.Series, universe: List[str]) -> Optional[np.ndarray]:
267
  try:
268
- ts = [t.strip().upper() for t in str(row["tickers"]).split(",") if t.strip()]
269
  ws = [float(x) for x in str(row["weights"]).split(",")]
270
  wmap = {t: ws[i] for i, t in enumerate(ts) if i < len(ws)}
271
  w = np.array([wmap.get(t, 0.0) for t in universe], dtype=float)
@@ -276,163 +300,131 @@ def _row_to_exposures(row: pd.Series, universe: List[str]) -> Optional[np.ndarra
276
  except Exception:
277
  return None
278
 
279
- def build_synthetic_dataset(universe: List[str], years: int, rf_ann: float, erp_ann: float) -> pd.DataFrame:
280
- symbols = list(sorted(set([s for s in universe if s])))
281
- moms = estimate_all_moments_aligned(symbols, years, rf_ann)
282
  covA, betas = moms["cov_ann"], moms["betas"]
283
 
284
- rows, rng = [], np.random.default_rng(12345)
285
- for i in range(1000):
286
- k = int(rng.integers(low=min(2, len(symbols)), high=min(8, len(symbols)) + 1))
287
- picks = list(rng.choice(symbols, size=k, replace=False))
288
- signs = rng.choice([-1.0, 1.0], size=k, p=[0.25, 0.75])
 
289
  raw = rng.dirichlet(np.ones(k))
290
- gross = 1.0 + float(rng.gamma(2.0, 0.7))
291
  w = gross * signs * raw
 
292
  beta_p, er_p, sigma_p = portfolio_stats({picks[j]: w[j] for j in range(k)}, covA, betas, rf_ann, erp_ann)
293
  rows.append({
294
  "id": i,
295
  "tickers": ",".join(picks),
296
  "weights": ",".join(f"{x:.6f}" for x in w),
297
- "beta_p": beta_p,
298
  "er_p": er_p,
299
- "sigma_p": sigma_p
 
300
  })
301
  return pd.DataFrame(rows)
302
 
303
- def save_synth_csv(df: pd.DataFrame, universe: List[str]) -> str:
304
- sig = hashlib.md5((",".join(sorted(universe)) + f":{len(df)}").encode()).hexdigest()[:8]
305
- path = os.path.join(DATA_DIR, f"investor_profiles_{sig}.csv")
306
- df.to_csv(path, index=False)
307
- return path
308
-
309
- # -------------- suggestion logic (dataset only, optional embeddings) --------------
310
- def describe_candidate_text(row: pd.Series, universe: List[str]) -> str:
311
- xs = _row_to_exposures(row, universe)
312
- if xs is None:
313
- return ""
314
- parts = []
315
- for t, w in sorted(zip(universe, xs), key=lambda z: -abs(z[1]))[:8]:
316
- if abs(w) > 1e-4:
317
- parts.append(f"{t} {w:+.2f}")
318
- desc = " ".join(parts)
319
- return f"weights {desc}; beta {row['beta_p']:.2f}; sigma {row['sigma_p']:.2f}; return {row['er_p']:.2f}"
320
-
321
- def pick_by_risk_from_dataset(csv_path: str,
322
- universe: List[str],
323
- risk_label: str,
324
- use_embeddings: bool) -> Optional[Dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  try:
326
  df = pd.read_csv(csv_path)
327
  except Exception:
328
- return None
 
329
  if df.empty:
330
- return None
331
 
332
- # candidates by sigma
333
- sigmas = df["sigma_p"].astype(float).values
334
- order_low = np.argsort(sigmas)
335
- order_high = order_low[::-1]
336
- med_value = float(np.median(sigmas))
337
- order_mid = np.argsort(np.abs(sigmas - med_value))
338
-
339
- if risk_label.lower() == "low":
340
- idxs = order_low[:30]
341
- elif risk_label.lower() == "high":
342
- idxs = order_high[:30]
343
- else:
344
- idxs = order_mid[:30]
345
 
346
- sub = df.iloc[idxs].copy()
347
- if sub.empty:
348
- return None
 
 
 
349
 
350
- # optional: rerank with finance embeddings against a risk prompt
351
  if use_embeddings:
352
- prompt_map = {
353
- "low": "low risk, stable, diversified, defensive, downside protection",
354
- "medium": "balanced risk, moderate volatility, diversified growth and income",
355
- "high": "high risk, aggressive growth, momentum, high volatility"
356
- }
357
- prompt = prompt_map.get(risk_label.lower(), prompt_map["medium"])
358
- try:
359
- from sentence_transformers import SentenceTransformer, util
360
- model = SentenceTransformer("FinLang/finance-embeddings-investopedia")
361
- cand_texts = [describe_candidate_text(r, universe) for _, r in sub.iterrows()]
362
- emb_prompt = model.encode([prompt], normalize_embeddings=True)
363
- emb_cands = model.encode(cand_texts, normalize_embeddings=True)
364
- sims = util.cos_sim(emb_prompt, emb_cands).cpu().numpy()[0]
365
- best_i = int(np.argsort(-sims)[0])
366
- chosen = sub.iloc[best_i]
367
- except Exception:
368
- chosen = sub.iloc[0]
369
- else:
370
- chosen = sub.iloc[0]
 
 
 
 
 
 
 
 
 
 
 
371
 
372
- # convert chosen row to exposure map on universe
373
- xs = _row_to_exposures(chosen, universe)
374
- if xs is None:
375
- return None
376
- wmap = {t: float(xs[i]) for i, t in enumerate(universe) if abs(xs[i]) > 1e-4}
377
- return {"weights": wmap,
378
- "er": float(chosen["er_p"]),
379
- "sigma": float(chosen["sigma_p"]),
380
- "beta": float(chosen["beta_p"])}
381
-
382
- def build_simple_suggestion_table(weights_exposure: Dict[str, float],
383
- gross_capital: float,
384
- top_n: int = 12) -> pd.DataFrame:
385
- rows = []
386
- for t, w in sorted(weights_exposure.items(), key=lambda kv: -abs(kv[1]))[:top_n]:
387
- rows.append({
388
- "ticker": t,
389
- "weight_%": round(float(w) * 100.0, 2),
390
- "dollars_$": round(float(w) * float(gross_capital), 0)
391
- })
392
- return pd.DataFrame(rows, columns=["ticker", "weight_%", "dollars_$"])
393
-
394
- # -------------- summary builder --------------
395
- def fmt_pct(x: float) -> str:
396
- return f"{x*100:.2f}%"
397
-
398
- def build_summary_md(lookback, horizon, rf, rf_code, erp, sigma_mkt,
399
- beta_p, er_p, sigma_p,
400
- a_sigma, b_sigma, mu_eff_sigma,
401
- a_mu, b_mu, sigma_eff_mu,
402
- sugg=None, risk_label=None) -> str:
403
- lines = []
404
- lines.append("### Inputs")
405
- lines.append(f"- Lookback years: **{lookback}**")
406
- lines.append(f"- Horizon years: **{int(round(horizon))}**")
407
- lines.append(f"- Risk-free: **{fmt_pct(rf)}** (FRED {rf_code})")
408
- lines.append(f"- Market ERP: **{fmt_pct(erp)}**")
409
- lines.append(f"- Market σ: **{fmt_pct(sigma_mkt)}**")
410
- lines.append("")
411
- lines.append("### Your portfolio (CAPM expectations)")
412
- lines.append(f"- Beta: **{beta_p:.2f}**")
413
- lines.append(f"- σ: **{fmt_pct(sigma_p)}**")
414
- lines.append(f"- Expected return: **{fmt_pct(er_p)}**")
415
- lines.append("")
416
- lines.append("### Efficient alternatives on CML")
417
- lines.append("Same σ as your portfolio")
418
- lines.append(f"- Market weight **{a_sigma:.2f}**, Bills weight **{b_sigma:.2f}**")
419
- lines.append(f"- Expected return **{fmt_pct(mu_eff_sigma)}**")
420
- lines.append("Same μ as your portfolio")
421
- lines.append(f"- Market weight **{a_mu:.2f}**, Bills weight **{b_mu:.2f}**")
422
- lines.append(f"- σ **{fmt_pct(sigma_eff_mu)}**")
423
- if sugg is not None:
424
- lines.append("")
425
- lines.append(f"### Dataset-based suggestion (risk: **{risk_label}**)")
426
- lines.append(f"- Suggested β **{sugg['beta']:.2f}**, σ **{fmt_pct(sugg['sigma'])}**, μ **{fmt_pct(sugg['er'])}**")
427
- return "\n".join(lines)
428
-
429
- # -------------- global state --------------
430
- UNIVERSE = [MARKET_TICKER, "QQQ", "XLK", "XLP", "XLE", "VNQ", "IEF", "HYG", "GLD", "EEM"]
431
- HORIZON_YEARS = 10
432
- RF_CODE = fred_series_for_horizon(HORIZON_YEARS)
433
- RF_ANN = fetch_fred_yield_annual(RF_CODE)
434
 
435
- # -------------- gradio callbacks --------------
436
  def search_tickers_cb(q: str):
437
  hits = yahoo_search(q)
438
  if not hits:
@@ -442,12 +434,13 @@ def search_tickers_cb(q: str):
442
 
443
  def add_symbol(selection: str, table: pd.DataFrame):
444
  if not selection:
445
- return table, "Pick a row from Matches first"
446
  symbol = selection.split("|")[0].strip().upper()
447
  current = [] if table is None or len(table) == 0 else [str(x).upper() for x in table["ticker"].tolist() if str(x) != "nan"]
448
  tickers = current if symbol in current else current + [symbol]
449
  val = validate_tickers(tickers, years=DEFAULT_LOOKBACK_YEARS)
450
  tickers = [t for t in tickers if t in val]
 
451
  amt_map = {}
452
  if table is not None and len(table) > 0:
453
  for _, r in table.iterrows():
@@ -471,6 +464,11 @@ def lock_ticker_column(tb: pd.DataFrame):
471
  amounts = amounts[:len(tickers)] + [0.0] * max(0, len(tickers) - len(amounts))
472
  return pd.DataFrame({"ticker": tickers, "amount_usd": amounts})
473
 
 
 
 
 
 
474
  def set_horizon(years: float):
475
  y = max(1.0, min(100.0, float(years)))
476
  code = fred_series_for_horizon(y)
@@ -479,141 +477,157 @@ def set_horizon(years: float):
479
  HORIZON_YEARS = y
480
  RF_CODE = code
481
  RF_ANN = rf
482
- return f"Risk-free series {code}. Latest annual rate {rf:.2%}. Will be used on compute."
483
 
484
- def compute(lookback: int,
485
  table: pd.DataFrame,
486
- risk_label: str,
487
  use_embeddings: bool):
488
-
489
- if table is None or len(table) == 0:
490
- return None, "Add at least one ticker", "Universe empty", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=["ticker","weight_%","dollars_$"]), None
491
-
492
- df = table.dropna().copy()
493
  df["ticker"] = df["ticker"].astype(str).str.upper().str.strip()
494
  df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce").fillna(0.0)
495
 
496
  symbols = [t for t in df["ticker"].tolist() if t]
497
- symbols = validate_tickers(symbols, lookback)
498
  if len(symbols) == 0:
499
- return None, "Could not validate any tickers", "Universe invalid", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=["ticker","weight_%","dollars_$"]), None
500
 
501
- global UNIVERSE
502
- UNIVERSE = list(sorted(set([s for s in symbols])))[:MAX_TICKERS]
 
 
 
503
 
504
- # amounts & gross (gross = sum of absolute exposures)
505
- amounts = {r["ticker"]: float(r["amount_usd"]) for _, r in df.iterrows() if r["ticker"] in UNIVERSE}
506
- gross = float(sum(abs(v) for v in amounts.values()))
507
- rf_ann = RF_ANN
 
 
508
 
509
- # aligned moments
510
- moms = estimate_all_moments_aligned(UNIVERSE, lookback, rf_ann)
511
  betas, covA, erp_ann, sigma_mkt = moms["betas"], moms["cov_ann"], moms["erp_ann"], moms["sigma_m_ann"]
 
512
 
513
- if gross <= 1e-12:
514
- return None, "All amounts are zero", f"Universe set to: {', '.join(UNIVERSE)}", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=["ticker","weight_%","dollars_$"]), None
515
 
516
- weights = {k: v / gross for k, v in amounts.items()}
517
- beta_p, er_p, sigma_p = portfolio_stats(weights, covA, betas, rf_ann, erp_ann)
 
 
 
 
518
 
519
- a_sigma, b_sigma, mu_eff_sigma = efficient_same_sigma(sigma_p, rf_ann, erp_ann, sigma_mkt)
520
- a_mu, b_mu, sigma_eff_mu = efficient_same_return(er_p, rf_ann, erp_ann, sigma_mkt)
 
521
 
522
- # build (or reuse) synthetic dataset for this universe
523
- csv_path = None
524
- # make a stable filename per-universe
525
- sig = hashlib.md5((",".join(sorted(UNIVERSE)) + f":{lookback}:{RF_CODE}").encode()).hexdigest()[:8]
526
- candidate_path = os.path.join(DATA_DIR, f"investor_profiles_{sig}.csv")
527
- if os.path.exists(candidate_path):
528
- csv_path = candidate_path
529
- else:
530
- synth_df = build_synthetic_dataset(UNIVERSE, years=lookback, rf_ann=rf_ann, erp_ann=erp_ann)
531
- csv_path = save_synth_csv(synth_df, UNIVERSE)
532
-
533
- # dataset-based suggestion by risk
534
- sug = pick_by_risk_from_dataset(csv_path, UNIVERSE, risk_label=risk_label, use_embeddings=use_embeddings)
535
- suggestion_df = pd.DataFrame(columns=["ticker","weight_%","dollars_$"])
536
- sugg_sigma_plot = None
537
- sugg_mu_plot = None
538
- if sug is not None:
539
- suggestion_df = build_simple_suggestion_table(sug["weights"], gross_capital=gross)
540
- sugg_sigma_plot = sug["sigma"]
541
- sugg_mu_plot = sug["er"]
542
-
543
- # positions table (computed from user's inputs)
544
- rows = []
545
- for t in UNIVERSE:
546
- if t in amounts:
547
- beta_val = 1.0 if t == moms["mkt"] else betas.get(t, np.nan)
548
- rows.append({
549
- "ticker": t,
550
- "amount_usd": float(amounts.get(t, 0.0)),
551
- "weight_exposure": float(weights.get(t, 0.0)),
552
- "beta": float(beta_val),
553
- })
554
- pos_table = pd.DataFrame(rows, columns=POS_COLS)
555
-
556
- # plot & summary
557
- img = plot_cml(
558
- rf_ann, erp_ann, sigma_mkt,
559
  sigma_p, er_p,
560
  sigma_p, mu_eff_sigma,
561
  sigma_eff_mu, er_p,
562
- sugg_sigma=sugg_sigma_plot, sugg_mu=sugg_mu_plot
563
  )
564
 
565
- info = build_summary_md(
566
- lookback, HORIZON_YEARS, rf_ann, RF_CODE, erp_ann, sigma_mkt,
567
- beta_p, er_p, sigma_p,
568
- a_sigma, b_sigma, mu_eff_sigma,
569
- a_mu, b_mu, sigma_eff_mu,
570
- sugg=sug, risk_label=risk_label
571
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
 
573
- uni_msg = f"Universe set to: {', '.join(UNIVERSE)}"
574
- return img, info, uni_msg, pos_table, suggestion_df, csv_path
575
 
576
- # -------------- UI --------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
  with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
 
 
 
 
 
 
 
 
 
 
 
578
  gr.Markdown(
579
  "## Efficient Portfolio Advisor\n"
580
- "Search symbols, enter dollar amounts, set your horizon. Prices from Yahoo Finance. Risk-free from FRED. "
581
- "Low/Medium/High suggestions are chosen only from a 1,000-row dataset generated from your current universe, "
582
  "optionally refined with finance embeddings."
583
  )
584
 
585
  with gr.Row():
586
  with gr.Column(scale=1):
 
587
  q = gr.Textbox(label="Search symbol")
588
- search_note = gr.Markdown()
589
  matches = gr.Dropdown(choices=[], label="Matches")
590
  with gr.Row():
591
  search_btn = gr.Button("Search")
592
  add_btn = gr.Button("Add selected to portfolio")
593
 
 
594
  gr.Markdown("### Portfolio positions — type dollar amounts (negatives allowed for shorts)")
595
  table = gr.Dataframe(
596
  headers=["ticker", "amount_usd"],
597
  datatype=["str", "number"],
598
  row_count=0,
599
  col_count=(2, "fixed"),
600
- wrap=True,
601
  )
602
 
 
603
  horizon = gr.Number(label="Horizon in years (1–100)", value=HORIZON_YEARS, precision=0)
604
  lookback = gr.Slider(1, 10, value=DEFAULT_LOOKBACK_YEARS, step=1, label="Lookback years for beta & sigma")
605
 
 
606
  gr.Markdown("### Suggestions")
607
- risk = gr.Radio(choices=["Low", "Medium", "High"], value="Medium", label="Risk tolerance")
608
- use_emb = gr.Checkbox(label="Use finance embeddings to refine picks", value=True)
609
 
610
  run_btn = gr.Button("Compute (build dataset & suggest)")
611
 
612
  with gr.Column(scale=1):
613
  plot = gr.Image(label="Capital Market Line (CML)", type="pil")
614
- summary = gr.Markdown(label="Inputs & Results")
615
  universe_msg = gr.Textbox(label="Universe status", interactive=False)
616
-
617
  positions = gr.Dataframe(
618
  label="Computed positions",
619
  headers=POS_COLS,
@@ -622,18 +636,17 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
622
  value=pd.DataFrame(columns=POS_COLS),
623
  interactive=False
624
  )
625
-
626
  suggestions = gr.Dataframe(
627
- label="Suggested holdings (weights are % of gross capital; negatives = shorts)",
628
- headers=["ticker", "weight_%", "dollars_$"],
629
- datatype=["str", "number", "number"],
630
- col_count=(3, "fixed"),
631
- value=pd.DataFrame(columns=["ticker","weight_%","dollars_$"]),
632
  interactive=False
633
  )
634
-
635
  dl = gr.File(label="Generated dataset CSV", value=None, visible=True)
636
 
 
637
  def do_search(query):
638
  note, options = search_tickers_cb(query)
639
  return note, gr.update(choices=options)
@@ -645,7 +658,7 @@ with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
645
 
646
  run_btn.click(
647
  fn=compute,
648
- inputs=[lookback, table, risk, use_emb],
649
  outputs=[plot, summary, universe_msg, positions, suggestions, dl]
650
  )
651
 
 
 
1
  # app.py
2
+ # Efficient Portfolio Advisor with dataset-based Low/Medium/High suggestions
3
+ # Modality: Text. Models: yfinance (prices), FRED (risk-free), simple CAPM math,
4
+ # optional reranking with sentence-transformers "FinLang/finance-embeddings-investopedia".
5
+
6
+ import os
7
+ import io
8
+ import math
9
+ import json
10
+ import warnings
11
  warnings.filterwarnings("ignore")
12
 
13
  from typing import List, Tuple, Dict, Optional
 
15
  import numpy as np
16
  import pandas as pd
17
  import matplotlib.pyplot as plt
 
18
  from PIL import Image
19
  import requests
20
+ import gradio as gr
21
  import yfinance as yf
22
 
23
+ # Optional (lazy) import for embeddings
24
+ _ST_MODEL = None
25
 
26
+ # ---------- Config ----------
27
  DATA_DIR = "data"
28
  os.makedirs(DATA_DIR, exist_ok=True)
29
 
30
+ MARKET_TICKER = "VOO"
31
  MAX_TICKERS = 30
32
  DEFAULT_LOOKBACK_YEARS = 10
33
+ DATASET_ROWS = 1000
34
 
35
+ # FRED mappings by horizon
 
 
 
36
  FRED_MAP = [
37
  (1, "DGS1"),
38
  (2, "DGS2"),
 
45
  (100, "DGS30"),
46
  ]
47
 
48
+ POS_COLS = ["ticker", "amount_usd", "weight_exposure", "beta"]
49
+ SUG_COLS_HOLD = ["pick", "ticker", "weight_%", "amount_$"]
50
+
51
+ # ---------- Small helpers ----------
52
+ def fmt_pct(x: float, dec: int = 2) -> str:
53
+ try:
54
+ return f"{x*100:.{dec}f}%"
55
+ except Exception:
56
+ return "—"
57
+
58
+ def fmt_usd(x: float) -> str:
59
+ try:
60
+ return f"${x:,.2f}"
61
+ except Exception:
62
+ return "—"
63
+
64
+ def ensure_dir(p: str):
65
+ os.makedirs(os.path.dirname(p), exist_ok=True)
66
+
67
  def fred_series_for_horizon(years: float) -> str:
68
  y = max(1.0, min(100.0, float(years)))
69
  for cutoff, code in FRED_MAP:
 
82
  except Exception:
83
  return 0.03
84
 
85
+ # ---------- Prices & returns (fix for 'Close' KeyError) ----------
86
  def fetch_prices_monthly(tickers: List[str], years: int) -> pd.DataFrame:
87
  start = pd.Timestamp.today(tz="UTC") - pd.DateOffset(years=years, days=7)
88
  end = pd.Timestamp.today(tz="UTC")
89
+
90
+ raw = yf.download(
91
  list(dict.fromkeys(tickers)),
92
  start=start.date(),
93
  end=end.date(),
94
  interval="1mo",
95
+ auto_adjust=False, # prefer 'Adj Close' if present
96
  progress=False,
97
  group_by="ticker",
98
+ threads=False,
99
+ )
100
+ if raw is None or len(raw) == 0:
101
+ return pd.DataFrame()
102
+
103
+ # MultiIndex (ticker, field) vs single-index
104
+ if isinstance(raw.columns, pd.MultiIndex):
105
+ price = None
106
+ for field in ("Adj Close", "Close"):
107
+ if field in raw.columns.get_level_values(-1):
108
+ price = raw.xs(field, axis=1, level=-1, drop_level=True)
109
+ break
110
+ if price is None:
111
+ price = raw.copy()
112
+ price.columns = [c[0] if isinstance(c, tuple) else c for c in price.columns]
113
+ else:
114
+ if "Adj Close" in raw.columns:
115
+ price = raw["Adj Close"]
116
+ elif "Close" in raw.columns:
117
+ price = raw["Close"]
118
+ else:
119
+ price = raw
120
+
121
+ if isinstance(price, pd.Series):
122
+ price = price.to_frame()
123
+
124
+ price = price.dropna(how="all").fillna(method="ffill")
125
+ price = price.loc[:, ~pd.Index(price.columns).duplicated()]
126
+ return price
127
 
128
  def monthly_returns(prices: pd.DataFrame) -> pd.DataFrame:
129
  return prices.pct_change().dropna()
130
 
131
+ def annualize_mean(m):
132
+ return np.asarray(m, dtype=float) * 12.0
133
+
134
+ def annualize_sigma(s):
135
+ return np.asarray(s, dtype=float) * math.sqrt(12.0)
136
+
137
+ # ---------- Search & validation ----------
138
  def yahoo_search(query: str):
139
+ if not query or not query.strip():
140
  return []
141
  url = "https://query1.finance.yahoo.com/v1/finance/search"
142
  params = {"q": query.strip(), "quotesCount": 10, "newsCount": 0}
 
153
  if sym and sym.isascii():
154
  out.append({"symbol": sym, "name": name, "exchange": exch})
155
  if not out:
156
+ out = [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": ""}]
157
  return out[:10]
158
  except Exception:
159
+ return [{"symbol": query.strip().upper(), "name": "typed symbol", "exchange": ""}]
160
 
161
  def validate_tickers(symbols: List[str], years: int) -> List[str]:
162
+ # include market to keep alignment, but validate only user symbols
163
+ base = list(dict.fromkeys([s.strip().upper() for s in symbols if s.strip()]))[:MAX_TICKERS]
 
 
164
  px = fetch_prices_monthly(base + [MARKET_TICKER], years)
165
  ok = [s for s in base if s in px.columns]
 
 
 
 
 
166
  return ok
167
 
168
+ # ---------- Aligned CAPM moments ----------
169
  def get_aligned_monthly_returns(symbols: List[str], years: int) -> pd.DataFrame:
170
+ uniq = [c for c in dict.fromkeys(symbols) if c != MARKET_TICKER]
171
  tickers = uniq + [MARKET_TICKER]
172
  px = fetch_prices_monthly(tickers, years)
 
 
 
 
173
  rets = monthly_returns(px)
174
+ cols = [c for c in uniq if c in rets.columns] + ([MARKET_TICKER] if MARKET_TICKER in rets.columns else [])
175
  R = rets[cols].dropna(how="any")
176
+ return R.loc[:, ~R.columns.duplicated()]
 
 
 
 
 
 
177
 
178
  def estimate_all_moments_aligned(symbols: List[str], years: int, rf_ann: float):
179
+ R = get_aligned_monthly_returns(symbols, years)
180
+ if MARKET_TICKER not in R.columns or R.shape[0] < 3:
181
+ raise ValueError("Not enough aligned data to estimate moments.")
182
  rf_m = rf_ann / 12.0
183
 
184
+ m = R[MARKET_TICKER]
185
  if isinstance(m, pd.DataFrame):
186
  m = m.iloc[:, 0].squeeze()
187
 
 
194
  var_m = max(var_m, 1e-6)
195
 
196
  betas: Dict[str, float] = {}
197
+ for s in [c for c in R.columns if c != MARKET_TICKER]:
198
  ex_s = R[s] - rf_m
199
  betas[s] = float(np.cov(ex_s.values, ex_m.values, ddof=1)[0, 1] / var_m)
200
+ betas[MARKET_TICKER] = 1.0
201
 
202
+ asset_cols = [c for c in R.columns if c != MARKET_TICKER]
 
 
203
  cov_m = np.cov(R[asset_cols].values.T, ddof=1) if asset_cols else np.zeros((0, 0))
204
  covA = pd.DataFrame(cov_m * 12.0, index=asset_cols, columns=asset_cols)
205
 
206
+ return {"betas": betas, "cov_ann": covA, "erp_ann": erp_ann, "sigma_m_ann": sigma_m_ann}
207
 
208
  def capm_er(beta: float, rf_ann: float, erp_ann: float) -> float:
209
  return float(rf_ann + beta * erp_ann)
 
214
  rf_ann: float,
215
  erp_ann: float) -> Tuple[float, float, float]:
216
  tickers = list(weights.keys())
217
+ if len(tickers) == 0:
218
+ return 0.0, 0.0, 0.0
219
  w = np.array([weights[t] for t in tickers], dtype=float)
220
  gross = float(np.sum(np.abs(w)))
221
+ if gross <= 1e-12:
222
  return 0.0, 0.0, 0.0
223
  w_expo = w / gross
224
  beta_p = float(np.dot([betas.get(t, 0.0) for t in tickers], w_expo))
225
  er_p = capm_er(beta_p, rf_ann, erp_ann)
226
  cov = cov_ann.reindex(index=tickers, columns=tickers).fillna(0.0).to_numpy()
227
+ sigma_p = math.sqrt(float(max(w_expo.T @ cov @ w_expo, 0.0)))
 
228
  return beta_p, er_p, sigma_p
229
 
230
+ # ---------- CML helpers (plot in %) ----------
231
  def efficient_same_sigma(sigma_target: float, rf_ann: float, erp_ann: float, sigma_mkt: float):
232
  if sigma_mkt <= 1e-12:
233
  return 0.0, 1.0, rf_ann
 
240
  a = (mu_target - rf_ann) / erp_ann
241
  return a, 1.0 - a, abs(a) * sigma_mkt
242
 
243
+ def plot_cml_percent(
244
  rf_ann, erp_ann, sigma_mkt,
245
  pt_sigma, pt_mu,
246
  same_sigma_sigma, same_sigma_mu,
247
  same_mu_sigma, same_mu_mu,
248
+ suggestion_sigma=None, suggestion_mu=None
249
  ) -> Image.Image:
250
+ fig = plt.figure(figsize=(6, 4), dpi=120)
251
 
252
  xmax = max(
253
+ 0.3,
254
  sigma_mkt * 2.0,
255
  pt_sigma * 1.4,
256
  same_mu_sigma * 1.4,
257
  same_sigma_sigma * 1.4,
258
+ (suggestion_sigma or 0.0) * 1.4,
259
  )
260
  xs = np.linspace(0, xmax, 160)
261
  slope = erp_ann / max(sigma_mkt, 1e-12)
262
  cml = rf_ann + slope * xs
263
+ plt.plot(xs * 100, cml * 100, label="CML via Market")
264
+
265
+ # Points
266
+ plt.scatter([0.0], [rf_ann * 100], label="Risk-free (FRED)")
267
+ plt.scatter([sigma_mkt * 100], [(rf_ann + erp_ann) * 100], label="Market VOO")
268
+ plt.scatter([pt_sigma * 100], [pt_mu * 100], label="Your portfolio")
269
+ plt.scatter([same_sigma_sigma * 100], [same_sigma_mu * 100], label="Efficient same sigma")
270
+ plt.scatter([same_mu_sigma * 100], [same_mu_mu * 100], label="Efficient same return")
271
+ if suggestion_sigma is not None and suggestion_mu is not None:
272
+ plt.scatter([suggestion_sigma * 100], [suggestion_mu * 100], label="Suggestion")
273
+
274
+ # simple dotted guides
275
+ plt.plot([pt_sigma * 100, same_sigma_sigma * 100], [pt_mu * 100, same_sigma_mu * 100], linestyle="--", lw=1, alpha=0.7, color="gray")
276
+ plt.plot([pt_sigma * 100, same_mu_sigma * 100], [pt_mu * 100, same_mu_mu * 100], linestyle="--", lw=1, alpha=0.7, color="gray")
277
+
278
+ plt.xlabel("σ (annualized, %)")
279
+ plt.ylabel("Expected return (annual, %)")
 
 
 
 
 
 
 
 
 
 
 
 
280
  plt.legend(loc="best", fontsize=8)
281
  plt.tight_layout()
282
 
 
286
  buf.seek(0)
287
  return Image.open(buf)
288
 
289
+ # ---------- Synthetic dataset (built only from current universe) ----------
290
+ def _row_exposures(row: pd.Series, universe: List[str]) -> Optional[np.ndarray]:
291
  try:
292
+ ts = [t.strip() for t in str(row["tickers"]).split(",")]
293
  ws = [float(x) for x in str(row["weights"]).split(",")]
294
  wmap = {t: ws[i] for i, t in enumerate(ts) if i < len(ws)}
295
  w = np.array([wmap.get(t, 0.0) for t in universe], dtype=float)
 
300
  except Exception:
301
  return None
302
 
303
def build_synthetic_dataset(universe: List[str], years: int, rf_ann: float, erp_ann: float, n_rows: int = DATASET_ROWS) -> pd.DataFrame:
    """Generate a dataset of random long/short portfolios drawn from `universe`.

    Each row records a random mix of tickers with signed weights plus its CAPM
    statistics (er_p, sigma_p, beta_p) computed from historical moments.
    Returns a DataFrame with columns: id, tickers, weights, er_p, sigma_p, beta_p.
    """
    # require MARKET_TICKER present for moments; weights exclude it unless random pick includes
    moms = estimate_all_moments_aligned(universe, years, rf_ann)
    covA, betas = moms["cov_ann"], moms["betas"]

    # Fixed seed so the generated dataset (and its cached CSV) is reproducible.
    rng = np.random.default_rng(12345)
    rows = []
    for i in range(n_rows):
        # Number of names per portfolio: min(2, n) .. min(8, n) inclusive
        # (integers() has an exclusive upper bound, hence the +1).
        k = int(rng.integers(low=min(2, len(universe)), high=min(8, len(universe)) + 1))
        picks = list(rng.choice(universe, size=k, replace=False))
        # ~20% chance each position is a short.
        signs = rng.choice([-1.0, 1.0], size=k, p=[0.2, 0.8])
        raw = rng.dirichlet(np.ones(k))
        # Gross exposure = 1 + Gamma(2, 0.5) draw, so always above 1x.
        gross = 1.0 + float(rng.gamma(2.0, 0.5))
        w = gross * signs * raw
        # portfolio stats
        beta_p, er_p, sigma_p = portfolio_stats({picks[j]: w[j] for j in range(k)}, covA, betas, rf_ann, erp_ann)
        rows.append({
            "id": i,
            "tickers": ",".join(picks),
            "weights": ",".join(f"{x:.6f}" for x in w),
            "er_p": er_p,
            "sigma_p": sigma_p,
            "beta_p": beta_p
        })
    return pd.DataFrame(rows)
328
 
329
def dataset_path_for_universe(universe: List[str]) -> str:
    """Return the per-universe dataset CSV cache path under DATA_DIR.

    The filename embeds a digest of the sorted ticker list, so the same
    universe (in any order) always maps to the same file.
    """
    key = ",".join(sorted(universe))
    # Use hashlib instead of the builtin hash(): str hash() is salted per
    # interpreter process (PYTHONHASHSEED), so the old abs(hash(key)) name
    # changed on every restart and the on-disk dataset cache never hit.
    digest = hashlib.sha1(key.encode("utf-8")).hexdigest()[:8]
    return os.path.join(DATA_DIR, f"investor_profiles_{digest}.csv")
334
+
335
+ # ---------- Suggestion logic (Low / Medium / High) ----------
336
+ def _risk_targets(sigmas: np.ndarray) -> Dict[str, float]:
337
+ # choose targets by quantiles of dataset sigma
338
+ return {
339
+ "Low": float(np.quantile(sigmas, 0.15)),
340
+ "Medium": float(np.quantile(sigmas, 0.50)),
341
+ "High": float(np.quantile(sigmas, 0.85)),
342
+ }
343
+
344
def _describe_row_for_embeddings(row: pd.Series, universe: List[str]) -> str:
    """Render a dataset row as a short text snippet for semantic reranking."""
    pieces = [
        f"sigma {row['sigma_p']:.4f}",
        f"beta {row['beta_p']:.2f}",
        f"expected return {row['er_p']:.4f}",
    ]
    exposures = _row_exposures(row, universe)
    if exposures is not None:
        # Rank names by absolute exposure and mention the four largest.
        ranked = sorted(
            ((name, float(abs(exposures[i]))) for i, name in enumerate(universe)),
            key=lambda pair: pair[1],
            reverse=True,
        )
        pieces.append("focus on " + ", ".join(f"{t}:{w:.2f}" for t, w in ranked[:4]))
    return " ".join(pieces)
352
+
353
+ def _get_prompt(risk_level: str) -> str:
354
+ if risk_level == "Low":
355
+ return "low risk, stable, conservative diversified portfolio"
356
+ if risk_level == "High":
357
+ return "high risk, growth oriented, aggressive portfolio"
358
+ return "balanced moderate risk diversified portfolio"
359
+
360
def _maybe_load_st_model():
    """Lazily load and cache the sentence-transformers model used for reranking.

    The import lives inside the function so the heavy dependency is only paid
    when embedding-based reranking is actually requested.
    """
    global _ST_MODEL
    # NOTE(review): assumes a module-level `_ST_MODEL = None` is defined
    # elsewhere in the file; otherwise the first check raises NameError — confirm.
    if _ST_MODEL is None:
        from sentence_transformers import SentenceTransformer
        _ST_MODEL = SentenceTransformer("FinLang/finance-embeddings-investopedia")
    return _ST_MODEL
366
+
367
def suggest_from_dataset(csv_path: str,
                         universe: List[str],
                         total_amount: float,
                         risk_level: str,
                         use_embeddings: bool = False):
    """Pick up to 3 portfolios from the synthetic dataset near the target risk.

    Loads the cached dataset CSV, targets the sigma quantile implied by
    `risk_level`, optionally reranks the candidate pool with finance
    embeddings, and returns a pair of:
      - a flat holdings DataFrame (pick #, ticker, weight %, dollar amount), and
      - (er_p, sigma_p) of the first usable pick, or None on load failure.
    """
    try:
        df = pd.read_csv(csv_path)
    except Exception:
        # Missing/unreadable dataset: return an empty table instead of raising.
        return pd.DataFrame(columns=SUG_COLS_HOLD), None

    if df.empty:
        return pd.DataFrame(columns=SUG_COLS_HOLD), None

    sigmas = df["sigma_p"].to_numpy(dtype=float)
    targets = _risk_targets(sigmas)
    # Unknown risk labels fall back to the Medium target.
    target_sigma = targets.get(risk_level, targets["Medium"])

    # distance to target sigma
    df = df.copy()
    df["dist"] = (df["sigma_p"] - target_sigma).abs()

    # Take a reasonable candidate pool
    cand = df.nsmallest(100, "dist").reset_index(drop=True)

    # Optional semantic rerank
    if use_embeddings:
        model = _maybe_load_st_model()
        prompt = _get_prompt(risk_level)
        # First text is the risk prompt; the rest describe each candidate row.
        texts = [prompt] + [ _describe_row_for_embeddings(r, universe) for _, r in cand.iterrows() ]
        embs = model.encode(texts)
        # NOTE(review): similarity() returns a tensor in recent
        # sentence-transformers; assign() relies on implicit conversion — confirm.
        S = model.similarity(embs[0:1], embs[1:]).flatten()  # cosine similarity
        cand = cand.assign(sim=S).sort_values("sim", ascending=False).head(50).reset_index(drop=True)

    # Now pick the top 3 by a combined score (distance, then ER desc)
    cand["score"] = cand["dist"] - 0.2 * cand["er_p"]  # small bias toward higher ER
    picks = cand.nsmallest(3, "score").reset_index(drop=True)

    # Build a simple holdings table: percent and dollars
    hold_rows = []
    first_pick_mu = None
    first_pick_sigma = None
    for i, row in picks.iterrows():
        expo = _row_exposures(row, universe)
        if expo is None:
            # Row failed to parse; skip it (a later pick may still populate the plot point).
            continue
        if first_pick_mu is None:
            first_pick_mu = float(row["er_p"])
            first_pick_sigma = float(row["sigma_p"])
        # Drop negligible exposures; show at most 12 holdings per pick,
        # largest absolute weight first.
        wmap = {universe[j]: float(expo[j]) for j in range(len(universe)) if abs(float(expo[j])) > 1e-4}
        for t, w in sorted(wmap.items(), key=lambda kv: -abs(kv[1]))[:12]:
            hold_rows.append({
                "pick": i + 1,
                "ticker": t,
                "weight_%": round(w * 100.0, 2),
                "amount_$": round(w * total_amount, 2)
            })

    hold_df = pd.DataFrame(hold_rows, columns=SUG_COLS_HOLD)
    return hold_df, (first_pick_mu, first_pick_sigma)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426
 
427
+ # ---------- UI callbacks ----------
428
  def search_tickers_cb(q: str):
429
  hits = yahoo_search(q)
430
  if not hits:
 
434
 
435
  def add_symbol(selection: str, table: pd.DataFrame):
436
  if not selection:
437
+ return table, "Pick a row from Matches first."
438
  symbol = selection.split("|")[0].strip().upper()
439
  current = [] if table is None or len(table) == 0 else [str(x).upper() for x in table["ticker"].tolist() if str(x) != "nan"]
440
  tickers = current if symbol in current else current + [symbol]
441
  val = validate_tickers(tickers, years=DEFAULT_LOOKBACK_YEARS)
442
  tickers = [t for t in tickers if t in val]
443
+ # preserve amounts
444
  amt_map = {}
445
  if table is not None and len(table) > 0:
446
  for _, r in table.iterrows():
 
464
  amounts = amounts[:len(tickers)] + [0.0] * max(0, len(tickers) - len(amounts))
465
  return pd.DataFrame({"ticker": tickers, "amount_usd": amounts})
466
 
467
# Session-global horizon / risk-free state; set_horizon() rebinds these during
# the session. NOTE(review): fetch_fred_yield_annual() performs a network call
# at import time (falling back to 3% on failure) — confirm that startup-time
# fetch is acceptable.
HORIZON_YEARS = 10
RF_CODE = fred_series_for_horizon(HORIZON_YEARS)
RF_ANN = fetch_fred_yield_annual(RF_CODE)
471
+
472
  def set_horizon(years: float):
473
  y = max(1.0, min(100.0, float(years)))
474
  code = fred_series_for_horizon(y)
 
477
  HORIZON_YEARS = y
478
  RF_CODE = code
479
  RF_ANN = rf
480
+ return f"Risk-free series {code}. Latest annual rate {fmt_pct(rf)}. Horizon set to {int(round(y))} years."
481
 
482
def compute(lookback_years: int,
            table: pd.DataFrame,
            risk_level: str,
            use_embeddings: bool):
    """Main Gradio callback: validate inputs, price the portfolio, and suggest mixes.

    Returns a 6-tuple wired to the UI outputs:
    (CML plot image, markdown summary, universe status text,
     positions DataFrame, suggestions DataFrame, dataset CSV path).
    Early-exit branches return placeholders in the same 6-tuple shape.
    Reads module globals RF_ANN / RF_CODE / HORIZON_YEARS set by set_horizon().
    """
    # ---- read table (dropna() returns a new frame, so the UI table is not mutated)
    df = table.dropna()
    df["ticker"] = df["ticker"].astype(str).str.upper().str.strip()
    df["amount_usd"] = pd.to_numeric(df["amount_usd"], errors="coerce").fillna(0.0)

    symbols = [t for t in df["ticker"].tolist() if t]
    if len(symbols) == 0:
        return None, "Add at least one ticker.", "", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=SUG_COLS_HOLD), None

    # Keep only tickers that actually resolve to price history.
    symbols = validate_tickers(symbols, lookback_years)
    if len(symbols) == 0:
        return None, "Could not validate any tickers.", "—", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=SUG_COLS_HOLD), None

    # Universe always includes the market proxy, deduplicated, sorted, capped.
    universe = list(sorted(set([s for s in symbols if s != MARKET_TICKER] + [MARKET_TICKER])))[:MAX_TICKERS]

    df = df[df["ticker"].isin(symbols)].copy()
    # NOTE(review): duplicate ticker rows collapse to the last amount here — confirm intended.
    amounts = {r["ticker"]: float(r["amount_usd"]) for _, r in df.iterrows()}
    # Gross exposure (sum of absolute dollars) is the normalization base,
    # so shorts contribute positively to the denominator.
    total_amt = float(sum(abs(v) for v in amounts.values()))
    if total_amt <= 1e-12:
        return None, "All amounts are zero.", f"Universe set to {', '.join(universe)}", pd.DataFrame(columns=POS_COLS), pd.DataFrame(columns=SUG_COLS_HOLD), None
    weights = {k: v / total_amt for k, v in amounts.items()}

    # ---- moments & portfolio metrics
    moms = estimate_all_moments_aligned(universe, lookback_years, RF_ANN)
    betas, covA, erp_ann, sigma_mkt = moms["betas"], moms["cov_ann"], moms["erp_ann"], moms["sigma_m_ann"]
    beta_p, er_p, sigma_p = portfolio_stats(weights, covA, betas, RF_ANN, erp_ann)

    # CML alternatives: same risk / same return as the user's portfolio.
    a_sigma, b_sigma, mu_eff_sigma = efficient_same_sigma(sigma_p, RF_ANN, erp_ann, sigma_mkt)
    a_mu, b_mu, sigma_eff_mu = efficient_same_return(er_p, RF_ANN, erp_ann, sigma_mkt)

    # ---- dataset build (only for current universe; cached on disk per universe)
    csv_path = dataset_path_for_universe(universe)
    if not os.path.exists(csv_path):
        synth = build_synthetic_dataset(universe, lookback_years, RF_ANN, erp_ann, n_rows=DATASET_ROWS)
        ensure_dir(csv_path)
        synth.to_csv(csv_path, index=False)

    # ---- dataset-based suggestions (simple table: percent & dollars)
    hold_df, first_pick_pt = suggest_from_dataset(csv_path, universe, total_amt, risk_level, use_embeddings)
    sug_mu, sug_sigma = (first_pick_pt if first_pick_pt is not None else (None, None))

    # ---- plot
    img = plot_cml_percent(
        RF_ANN, erp_ann, sigma_mkt,
        sigma_p, er_p,
        sigma_p, mu_eff_sigma,
        sigma_eff_mu, er_p,
        suggestion_sigma=sug_sigma, suggestion_mu=sug_mu
    )

    # ---- summary (percent everywhere)
    info_lines = []
    info_lines.append("### Inputs")
    info_lines.append(f"- Lookback years {int(lookback_years)}")
    info_lines.append(f"- Horizon years {int(round(HORIZON_YEARS))}")
    info_lines.append(f"- Risk-free {fmt_pct(RF_ANN)} from {RF_CODE}")
    info_lines.append(f"- Market ERP {fmt_pct(erp_ann)}")
    info_lines.append(f"- Market σ {fmt_pct(sigma_mkt)}")
    info_lines.append("")
    info_lines.append("### Your portfolio (CAPM expectations)")
    info_lines.append(f"- Beta {beta_p:.2f}")
    info_lines.append(f"- σ {fmt_pct(sigma_p)}")
    info_lines.append(f"- Expected return {fmt_pct(er_p)}")
    info_lines.append("")
    info_lines.append("### Efficient alternatives on CML")
    info_lines.append(f"- Same σ as your portfolio → Market weight {a_sigma:.2f}, Bills weight {b_sigma:.2f}, return {fmt_pct(mu_eff_sigma)}")
    info_lines.append(f"- Same expected return → Market weight {a_mu:.2f}, Bills weight {b_mu:.2f}, σ {fmt_pct(sigma_eff_mu)}")
    info_lines.append("")
    info_lines.append(f"### Dataset-based suggestions (risk: {risk_level})")
    info_lines.append("- Shown below as simple holdings: percent of exposure and dollars allocated.")
    if use_embeddings:
        info_lines.append("- Reranked with finance embeddings (FinLang/finance-embeddings-investopedia).")

    info = "\n".join(info_lines)

    # ---- positions table for current inputs
    rows = []
    for t in symbols:
        # The market proxy is beta 1.0 by definition.
        beta_val = 1.0 if t == MARKET_TICKER else betas.get(t, np.nan)
        rows.append({
            "ticker": t,
            "amount_usd": round(amounts.get(t, 0.0), 2),
            "weight_exposure": round(weights.get(t, 0.0), 6),
            "beta": round(beta_val, 6),
        })
    pos_table = pd.DataFrame(rows, columns=POS_COLS)

    uni_msg = f"Universe set to: {', '.join(universe)}"
    return img, info, uni_msg, pos_table, hold_df, csv_path
575
+
576
+ # ---------- UI ----------
577
  with gr.Blocks(title="Efficient Portfolio Advisor") as demo:
578
+ with gr.Accordion("About (assignment section 1)", open=False):
579
+ gr.Markdown(
580
+ "**Modality**: Text.\n\n"
581
+ "**Use case**: Given a user’s stock/ETF universe and current dollar amounts, the system recommends three "
582
+ "alternative mixes (Low / Medium / High risk) generated from a 1,000-row dataset of random portfolios built "
583
+ "only from the user’s current universe.\n\n"
584
+ "**System goal**: User provides text inputs (tickers and amounts). The system returns three similar items "
585
+ "(suggested mixes) from the dataset. Optional reranking uses the text-embedding model "
586
+ "`FinLang/finance-embeddings-investopedia`."
587
+ )
588
+
589
  gr.Markdown(
590
  "## Efficient Portfolio Advisor\n"
591
+ "Search symbols, enter dollar amounts, set your horizon. Prices from Yahoo Finance. "
592
+ "Risk-free from FRED. Low/Medium/High suggestions are chosen only from a 1,000-row dataset generated from your current universe, "
593
  "optionally refined with finance embeddings."
594
  )
595
 
596
  with gr.Row():
597
  with gr.Column(scale=1):
598
+ # search
599
  q = gr.Textbox(label="Search symbol")
600
+ search_note = gr.Markdown(" ")
601
  matches = gr.Dropdown(choices=[], label="Matches")
602
  with gr.Row():
603
  search_btn = gr.Button("Search")
604
  add_btn = gr.Button("Add selected to portfolio")
605
 
606
+ # portfolio table
607
  gr.Markdown("### Portfolio positions — type dollar amounts (negatives allowed for shorts)")
608
  table = gr.Dataframe(
609
  headers=["ticker", "amount_usd"],
610
  datatype=["str", "number"],
611
  row_count=0,
612
  col_count=(2, "fixed"),
613
+ value=pd.DataFrame(columns=["ticker", "amount_usd"])
614
  )
615
 
616
+ # horizon & lookback
617
  horizon = gr.Number(label="Horizon in years (1–100)", value=HORIZON_YEARS, precision=0)
618
  lookback = gr.Slider(1, 10, value=DEFAULT_LOOKBACK_YEARS, step=1, label="Lookback years for beta & sigma")
619
 
620
+ # suggestions controls
621
  gr.Markdown("### Suggestions")
622
+ risk = gr.Radio(["Low", "Medium", "High"], value="Medium", label="Risk tolerance")
623
+ use_st = gr.Checkbox(label="Use finance embeddings to refine picks", value=False)
624
 
625
  run_btn = gr.Button("Compute (build dataset & suggest)")
626
 
627
  with gr.Column(scale=1):
628
  plot = gr.Image(label="Capital Market Line (CML)", type="pil")
629
+ summary = gr.Markdown(label="Summary")
630
  universe_msg = gr.Textbox(label="Universe status", interactive=False)
 
631
  positions = gr.Dataframe(
632
  label="Computed positions",
633
  headers=POS_COLS,
 
636
  value=pd.DataFrame(columns=POS_COLS),
637
  interactive=False
638
  )
 
639
  suggestions = gr.Dataframe(
640
+ label="Dataset-based suggestions (top 3 holdings shown as % and $)",
641
+ headers=SUG_COLS_HOLD,
642
+ datatype=["number", "str", "number", "number"],
643
+ col_count=(len(SUG_COLS_HOLD), "fixed"),
644
+ value=pd.DataFrame(columns=SUG_COLS_HOLD),
645
  interactive=False
646
  )
 
647
  dl = gr.File(label="Generated dataset CSV", value=None, visible=True)
648
 
649
+ # wiring
650
    def do_search(query):
        """Run the symbol search and refresh the Matches dropdown choices."""
        note, options = search_tickers_cb(query)
        return note, gr.update(choices=options)
 
658
 
659
  run_btn.click(
660
  fn=compute,
661
+ inputs=[lookback, table, risk, use_st],
662
  outputs=[plot, summary, universe_msg, positions, suggestions, dl]
663
  )
664