wenjun99 committed on
Commit
df3d2a3
·
verified ·
1 Parent(s): a38ee9b

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +60 -14
src/app.py CHANGED
@@ -374,38 +374,84 @@ with tab3:
374
  st.markdown("### 1️⃣ Raw Data Distribution")
375
  st.caption("Visualize editing values across all positions and samples — before any binary labelling.")
376
 
377
- log_toggle = st.checkbox("Apply log1p transformation to values", value=False, key="log_toggle")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
 
379
  # Melt data to long format: (sample, position_index, value)
380
- melted = pos_data.melt(var_name="Position", value_name="Value")
381
  melted["Position_idx"] = melted["Position"].apply(
382
  lambda x: int(re.search(r"(\d+)", str(x)).group(1)) if re.search(r"(\d+)", str(x)) else 0
383
  )
384
- if log_toggle:
385
- melted["Value"] = np.log1p(melted["Value"])
386
- value_label = "Editing Value (log1p)"
387
- else:
388
- value_label = "Editing Value"
389
 
390
  # =====================================================
391
  # PLOT 2: Histogram — all values
392
  # =====================================================
393
  st.markdown("#### 📊 Histogram — All Values")
394
 
395
- n_bins = st.slider("Number of bins:", min_value=20, max_value=200, value=80, key="hist_bins")
396
 
397
  fig2, ax2 = plt.subplots(figsize=(10, 4))
398
  ax2.hist(melted["Value"].values, bins=n_bins, color="#4F46E5", edgecolor="white", linewidth=0.3)
399
  ax2.set_xlabel(value_label)
400
  ax2.set_ylabel("Count")
401
- transform_label = "log1p" if log_toggle else "linear"
402
- ax2.set_title(f"Raw Values Distribution ({transform_label})")
403
- # Fine x-axis ticks: every 0.2 for log1p, every 5 for linear
404
  val_max = melted["Value"].max()
405
- if log_toggle:
406
- ax2.set_xticks(np.arange(0, val_max + 0.2, 0.2))
 
 
 
 
 
407
  else:
408
- ax2.set_xticks(np.arange(0, val_max + 5, 5))
 
 
409
  ax2.tick_params(axis='x', labelsize=8, rotation=45)
410
  ax2.grid(axis='y', alpha=0.3)
411
  fig2.tight_layout()
 
374
  st.markdown("### 1️⃣ Raw Data Distribution")
375
  st.caption("Visualize editing values across all positions and samples — before any binary labelling.")
376
 
377
+ transform_option = st.selectbox(
378
+ "Value transformation:",
379
+ ["Raw (linear)", "log1p", "log1p → log1p", "log1p → pos. norm."],
380
+ index=0,
381
+ key="transform_select",
382
+ help=(
383
+ "**Raw** — No transformation.\n\n"
384
+ "**log1p** — `log(1 + x)`. Compresses high values, spreads low range.\n\n"
385
+ "**log1p → log1p** — Double log1p. Even stronger compression.\n\n"
386
+ "**log1p → pos. norm.** — log1p then robust per-position normalization "
387
+ "(median / IQR scaling per position column)."
388
+ )
389
+ )
390
+
391
+ # --- Apply transforms ---
392
+ def robust_pos_normalize_log1p(data: pd.DataFrame) -> pd.DataFrame:
393
+ """log1p then robust per-position normalization (median + IQR)."""
394
+ logged = np.log1p(data)
395
+ result = logged.copy()
396
+ for col in result.columns:
397
+ med = result[col].median()
398
+ q75, q25 = result[col].quantile(0.75), result[col].quantile(0.25)
399
+ iqr = q75 - q25
400
+ if iqr > 0:
401
+ result[col] = (result[col] - med) / iqr
402
+ else:
403
+ result[col] = result[col] - med
404
+ return result
405
+
406
+ if transform_option == "log1p":
407
+ transformed = np.log1p(pos_data)
408
+ value_label = "Editing Value (log1p)"
409
+ transform_tag = "log1p"
410
+ elif transform_option == "log1p → log1p":
411
+ transformed = np.log1p(np.log1p(pos_data))
412
+ value_label = "Editing Value (log1p → log1p)"
413
+ transform_tag = "log1p_log1p"
414
+ elif transform_option == "log1p → pos. norm.":
415
+ transformed = robust_pos_normalize_log1p(pos_data)
416
+ value_label = "Editing Value (log1p → pos. norm.)"
417
+ transform_tag = "log1p_posnorm"
418
+ else:
419
+ transformed = pos_data
420
+ value_label = "Editing Value"
421
+ transform_tag = "raw"
422
 
423
  # Melt data to long format: (sample, position_index, value)
424
+ melted = transformed.melt(var_name="Position", value_name="Value")
425
  melted["Position_idx"] = melted["Position"].apply(
426
  lambda x: int(re.search(r"(\d+)", str(x)).group(1)) if re.search(r"(\d+)", str(x)) else 0
427
  )
 
 
 
 
 
428
 
429
  # =====================================================
430
  # PLOT 2: Histogram — all values
431
  # =====================================================
432
  st.markdown("#### 📊 Histogram — All Values")
433
 
434
+ n_bins = st.number_input("Number of bins:", min_value=10, max_value=300, value=80, step=10, key="hist_bins")
435
 
436
  fig2, ax2 = plt.subplots(figsize=(10, 4))
437
  ax2.hist(melted["Value"].values, bins=n_bins, color="#4F46E5", edgecolor="white", linewidth=0.3)
438
  ax2.set_xlabel(value_label)
439
  ax2.set_ylabel("Count")
440
+ ax2.set_title(f"Raw Values Distribution ({transform_tag})")
441
+ # Fine x-axis ticks adapted to transform range
442
+ val_min = melted["Value"].min()
443
  val_max = melted["Value"].max()
444
+ val_range = val_max - val_min
445
+ if val_range <= 2:
446
+ tick_step = 0.1
447
+ elif val_range <= 6:
448
+ tick_step = 0.2
449
+ elif val_range <= 20:
450
+ tick_step = 1
451
  else:
452
+ tick_step = 5
453
+ ax2.set_xticks(np.arange(np.floor(val_min / tick_step) * tick_step,
454
+ val_max + tick_step, tick_step))
455
  ax2.tick_params(axis='x', labelsize=8, rotation=45)
456
  ax2.grid(axis='y', alpha=0.3)
457
  fig2.tight_layout()