Spaces:
Sleeping
Sleeping
Update src/app.py
Browse files- src/app.py +60 -14
src/app.py
CHANGED
|
@@ -374,38 +374,84 @@ with tab3:
|
|
| 374 |
st.markdown("### 1️⃣ Raw Data Distribution")
|
| 375 |
st.caption("Visualize editing values across all positions and samples — before any binary labelling.")
|
| 376 |
|
| 377 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
|
| 379 |
# Melt data to long format: (sample, position_index, value)
|
| 380 |
-
melted =
|
| 381 |
melted["Position_idx"] = melted["Position"].apply(
|
| 382 |
lambda x: int(re.search(r"(\d+)", str(x)).group(1)) if re.search(r"(\d+)", str(x)) else 0
|
| 383 |
)
|
| 384 |
-
if log_toggle:
|
| 385 |
-
melted["Value"] = np.log1p(melted["Value"])
|
| 386 |
-
value_label = "Editing Value (log1p)"
|
| 387 |
-
else:
|
| 388 |
-
value_label = "Editing Value"
|
| 389 |
|
| 390 |
# =====================================================
|
| 391 |
# PLOT 2: Histogram — all values
|
| 392 |
# =====================================================
|
| 393 |
st.markdown("#### 📊 Histogram — All Values")
|
| 394 |
|
| 395 |
-
n_bins = st.
|
| 396 |
|
| 397 |
fig2, ax2 = plt.subplots(figsize=(10, 4))
|
| 398 |
ax2.hist(melted["Value"].values, bins=n_bins, color="#4F46E5", edgecolor="white", linewidth=0.3)
|
| 399 |
ax2.set_xlabel(value_label)
|
| 400 |
ax2.set_ylabel("Count")
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
val_max = melted["Value"].max()
|
| 405 |
-
|
| 406 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
else:
|
| 408 |
-
|
|
|
|
|
|
|
| 409 |
ax2.tick_params(axis='x', labelsize=8, rotation=45)
|
| 410 |
ax2.grid(axis='y', alpha=0.3)
|
| 411 |
fig2.tight_layout()
|
|
|
|
| 374 |
st.markdown("### 1️⃣ Raw Data Distribution")
|
| 375 |
st.caption("Visualize editing values across all positions and samples — before any binary labelling.")
|
| 376 |
|
| 377 |
+
transform_option = st.selectbox(
|
| 378 |
+
"Value transformation:",
|
| 379 |
+
["Raw (linear)", "log1p", "log1p β log1p", "log1p β pos. norm."],
|
| 380 |
+
index=0,
|
| 381 |
+
key="transform_select",
|
| 382 |
+
help=(
|
| 383 |
+
"**Raw** — No transformation.\n\n"
|
| 384 |
+
"**log1p** — `log(1 + x)`. Compresses high values, spreads low range.\n\n"
|
| 385 |
+
"**log1p β log1p** β Double log1p. Even stronger compression.\n\n"
|
| 386 |
+
"**log1p β pos. norm.** β log1p then robust per-position normalization "
|
| 387 |
+
"(median / IQR scaling per position column)."
|
| 388 |
+
)
|
| 389 |
+
)
|
| 390 |
+
|
| 391 |
+
# --- Apply transforms ---
|
| 392 |
+
def robust_pos_normalize_log1p(data: pd.DataFrame) -> pd.DataFrame:
|
| 393 |
+
"""log1p then robust per-position normalization (median + IQR)."""
|
| 394 |
+
logged = np.log1p(data)
|
| 395 |
+
result = logged.copy()
|
| 396 |
+
for col in result.columns:
|
| 397 |
+
med = result[col].median()
|
| 398 |
+
q75, q25 = result[col].quantile(0.75), result[col].quantile(0.25)
|
| 399 |
+
iqr = q75 - q25
|
| 400 |
+
if iqr > 0:
|
| 401 |
+
result[col] = (result[col] - med) / iqr
|
| 402 |
+
else:
|
| 403 |
+
result[col] = result[col] - med
|
| 404 |
+
return result
|
| 405 |
+
|
| 406 |
+
if transform_option == "log1p":
|
| 407 |
+
transformed = np.log1p(pos_data)
|
| 408 |
+
value_label = "Editing Value (log1p)"
|
| 409 |
+
transform_tag = "log1p"
|
| 410 |
+
elif transform_option == "log1p β log1p":
|
| 411 |
+
transformed = np.log1p(np.log1p(pos_data))
|
| 412 |
+
value_label = "Editing Value (log1p β log1p)"
|
| 413 |
+
transform_tag = "log1p_log1p"
|
| 414 |
+
elif transform_option == "log1p β pos. norm.":
|
| 415 |
+
transformed = robust_pos_normalize_log1p(pos_data)
|
| 416 |
+
value_label = "Editing Value (log1p β pos. norm.)"
|
| 417 |
+
transform_tag = "log1p_posnorm"
|
| 418 |
+
else:
|
| 419 |
+
transformed = pos_data
|
| 420 |
+
value_label = "Editing Value"
|
| 421 |
+
transform_tag = "raw"
|
| 422 |
|
| 423 |
# Melt data to long format: (sample, position_index, value)
|
| 424 |
+
melted = transformed.melt(var_name="Position", value_name="Value")
|
| 425 |
melted["Position_idx"] = melted["Position"].apply(
|
| 426 |
lambda x: int(re.search(r"(\d+)", str(x)).group(1)) if re.search(r"(\d+)", str(x)) else 0
|
| 427 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
|
| 429 |
# =====================================================
|
| 430 |
# PLOT 2: Histogram — all values
|
| 431 |
# =====================================================
|
| 432 |
st.markdown("#### 📊 Histogram — All Values")
|
| 433 |
|
| 434 |
+
n_bins = st.number_input("Number of bins:", min_value=10, max_value=300, value=80, step=10, key="hist_bins")
|
| 435 |
|
| 436 |
fig2, ax2 = plt.subplots(figsize=(10, 4))
|
| 437 |
ax2.hist(melted["Value"].values, bins=n_bins, color="#4F46E5", edgecolor="white", linewidth=0.3)
|
| 438 |
ax2.set_xlabel(value_label)
|
| 439 |
ax2.set_ylabel("Count")
|
| 440 |
+
ax2.set_title(f"Raw Values Distribution ({transform_tag})")
|
| 441 |
+
# Fine x-axis ticks adapted to transform range
|
| 442 |
+
val_min = melted["Value"].min()
|
| 443 |
val_max = melted["Value"].max()
|
| 444 |
+
val_range = val_max - val_min
|
| 445 |
+
if val_range <= 2:
|
| 446 |
+
tick_step = 0.1
|
| 447 |
+
elif val_range <= 6:
|
| 448 |
+
tick_step = 0.2
|
| 449 |
+
elif val_range <= 20:
|
| 450 |
+
tick_step = 1
|
| 451 |
else:
|
| 452 |
+
tick_step = 5
|
| 453 |
+
ax2.set_xticks(np.arange(np.floor(val_min / tick_step) * tick_step,
|
| 454 |
+
val_max + tick_step, tick_step))
|
| 455 |
ax2.tick_params(axis='x', labelsize=8, rotation=45)
|
| 456 |
ax2.grid(axis='y', alpha=0.3)
|
| 457 |
fig2.tight_layout()
|