Lars Masanneck committed
Commit 04428af · 1 Parent(s): 96a206a

Proper initial commit
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM python:3.11-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy and install dependencies
+ COPY requirements.txt ./
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY . ./
+
+ # Expose Streamlit default port
+ EXPOSE 8501
+
+ # Run Streamlit app
+ CMD ["streamlit", "run", "app.py", "--server.address=0.0.0.0", "--server.port=8501"]
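Note (not part of the commit): with this Dockerfile the app would typically be built and started with `docker build -t smartwatch-zscore .` followed by `docker run -p 8501:8501 smartwatch-zscore`; the image name is only a placeholder, and the 8501 port mapping matches the EXPOSE and CMD lines above.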
Table_1_summary_measure.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,404 @@
+ import streamlit as st
+ import normalizer_model
+ import numpy as np
+ import pandas as pd
+ import altair as alt
+ import plotly.graph_objects as go
+ from scipy.stats import norm
+
+ # Configure the Streamlit page before other commands
+ st.set_page_config(
+     page_title="Smartwatch Normative Z-Score Calculator",
+     layout="wide",
+ )
+
+
+ # Cache the normative DataFrame load
+ def load_norm_df(path: str):
+     return normalizer_model.load_normative_table(path)
+
+
+ load_norm_df = st.cache_data(load_norm_df)
+
+ # Load dataset
+ norm_df = load_norm_df("Table_1_summary_measure.csv")
+
+ # Friendly biomarker labels
+ BIOMARKER_LABELS = {
+     "nb_steps": "Number of Steps",
+     "max_steps": "Maximum Steps",
+     "mean_active_time": "Mean Active Time",
+     "sbp": "Systolic Blood Pressure",
+     "dbp": "Diastolic Blood Pressure",
+     "sleep_duration": "Sleep Duration",
+     "avg_night_hr": "Average Night Heart Rate",
+     "nb_moderate_active_minutes": "Moderate Active Minutes",
+     "nb_vigorous_active_minutes": "Vigorous Active Minutes",
+     "weight": "Weight",
+     "pwv": "Pulse Wave Velocity",
+     # add any others here
+ }
+
+
+ def main():
+     if "disclaimer_shown" not in st.session_state:
+         st.info(
+             "These calculations are intended for scientific purposes only. "
+             "For detailed questions regarding personal health data, please "
+             "contact your healthcare professional."
+         )
+         st.session_state.disclaimer_shown = True
+     st.title("Smartwatch Normative Z-Score Calculator")
+     st.sidebar.header("Input Parameters")
+
+     # Region with default Western Europe
+     regions = sorted(norm_df["area"].unique())
+     if "Western Europe" in regions:
+         default_region = "Western Europe"
+     else:
+         default_region = regions[0]
+     region = st.sidebar.selectbox(
+         "Region",
+         regions,
+         index=regions.index(default_region),
+     )
+
+     # Gender selection
+     gender = st.sidebar.selectbox(
+         "Gender",
+         sorted(norm_df["gender"].unique()),
+     )
+
+     # Age input: choose between years or group
+     st.sidebar.subheader("Age Input")
+     age_input_mode = st.sidebar.radio(
+         "Age input mode",
+         ("Years", "Group"),
+     )
+     if age_input_mode == "Years":
+         age_years = st.sidebar.number_input(
+             "Age (years)",
+             min_value=0,
+             max_value=120,
+             value=30,
+             step=1,
+         )
+         age_param = age_years
+     else:
+         age_groups = sorted(
+             norm_df["Age"].unique(),
+             key=lambda x: int(x.split("-")[0]),
+         )
+         age_group = st.sidebar.selectbox("Age group", [""] + age_groups)
+         age_param = age_group
+
+     # BMI input: choose between value or category
+     st.sidebar.subheader("BMI Input")
+     bmi_input_mode = st.sidebar.radio(
+         "BMI input mode",
+         ("Value", "Category"),
+     )
+     if bmi_input_mode == "Value":
+         bmi_val = st.sidebar.number_input(
+             "BMI",
+             min_value=0.0,
+             max_value=100.0,
+             value=24.0,
+             step=0.1,
+             format="%.1f",
+         )
+         bmi_param = bmi_val
+     else:
+         bmi_cats = sorted(norm_df["Bmi"].unique())
+         bmi_cat = st.sidebar.selectbox("BMI category", [""] + bmi_cats)
+         bmi_param = bmi_cat
+
+     # Biomarker selection with friendly labels
+     codes = sorted(norm_df["Biomarkers"].unique())
+     friendly = [BIOMARKER_LABELS.get(c, c.title()) for c in codes]
+     default_idx = friendly.index("Number of Steps")
+     selected_label = st.sidebar.selectbox(
+         "Biomarker",
+         friendly,
+         index=default_idx,
+     )
+     biomarker = codes[friendly.index(selected_label)]
+
+     # Value input with consistent float types
+     default_value = 6500.0 if biomarker == "nb_steps" else 0.0
+     # Determine upper bound from normative data
+     mask = norm_df["Biomarkers"].str.lower() == biomarker.lower()
+     max_val = float(norm_df.loc[mask, "max"].max())
+     value = st.sidebar.number_input(
+         f"{selected_label} value",
+         min_value=0.0,
+         max_value=max_val,
+         value=default_value,
+         step=1.0,
+     )
+
+     # Compute
+     norm_button = st.sidebar.button("Compute Normative Z-Score")
+     if norm_button:
+         try:
+             res = normalizer_model.compute_normative_position(
+                 value=value,
+                 biomarker=biomarker,
+                 age_group=age_param,
+                 region=region,
+                 gender=gender,
+                 bmi=bmi_param,
+                 normative_df=norm_df,
+             )
+         except Exception as e:
+             st.error(f"Error: {e}")
+             return
+
+         # Show metrics
+         st.subheader("Results")
+         m1, m2, m3, m4, m5 = st.columns(5)
+         m1.metric("Z-Score", f"{res['z_score']:.2f}")
+         m2.metric("Percentile", f"{res['percentile']:.2f}")
+         m3.metric("Mean", f"{res['mean']:.2f}")
+         m4.metric("SD", f"{res['sd']:.2f}")
+         m5.metric("Sample Size", res["n"])
+
+         # Compute actual age group and BMI category for cohort summary
+         age_group_str = normalizer_model._categorize_age(age_param, norm_df)
+         bmi_cat = normalizer_model.categorize_bmi(bmi_param)
+         st.markdown(
+             f"**Basis of calculation:** Data from region **{region}**, "
+             f"gender **{gender}**, age group **{age_group_str}**, "
+             f"and BMI category **{bmi_cat}**. "
+             f"Sample size: {res['n']}."
+         )
+
+         # Detailed statistics table
+         st.subheader("Detailed Statistics")
+         stats_df = pd.DataFrame(
+             {
+                 "Statistic": [
+                     "Z-Score",
+                     "Percentile",
+                     "Mean",
+                     "SD",
+                     "Sample Size",
+                     "Median",
+                     "Q1",
+                     "Q3",
+                     "IQR",
+                     "MAD",
+                     "SE",
+                     "CI",
+                 ],
+                 "Value": [
+                     f"{res['z_score']:.2f}",
+                     f"{res['percentile']:.2f}",
+                     f"{res['mean']:.2f}",
+                     f"{res['sd']:.2f}",
+                     res.get("n", "N/A"),
+                     f"{res.get('median', float('nan')):.2f}",
+                     f"{res.get('q1', float('nan')):.2f}",
+                     f"{res.get('q3', float('nan')):.2f}",
+                     f"{res.get('iqr', float('nan')):.2f}",
+                     f"{res.get('mad', float('nan')):.2f}",
+                     f"{res.get('se', float('nan')):.2f}",
+                     f"{res.get('ci', float('nan')):.2f}",
+                 ],
+             }
+         )
+         st.table(stats_df)
+
+         # Normality assumption note
+         note = (
+             "*Note: Percentile and z-score estimation assume a normal "
+             "distribution based on global Withings user data stratified by "
+             "the parameters entered.*"
+         )
+         st.write(note)
+
+         # Normality checks
+         import normality_checks as nc
+
+         R = nc.iqr_tail_heaviness(res["iqr"], res["sd"])
+         q1_z, q3_z = nc.quartile_z_scores(
+             res["mean"],
+             res["sd"],
+             res["q1"],
+             res["q3"],
+         )
+         skew = nc.pearson_skewness(res["mean"], res["median"], res["sd"])
+         st.subheader("Normality Heuristics")
+
+         # Determine skewness interpretation
+         if abs(skew) <= 0.1:
+             skew_interp = "Symmetric (OK)"
+         elif abs(skew) <= 0.5:
+             skew_interp = f"{'Right' if skew > 0 else 'Left'} slight skew (usually OK)"
+         elif abs(skew) <= 1.0:
+             skew_interp = f"{'Right' if skew > 0 else 'Left'} noticeable skew"
+         else:
+             skew_interp = f"{'Right' if skew > 0 else 'Left'} strong skew"
+
+         norm_checks = pd.DataFrame(
+             {
+                 "Check": [
+                     "IQR/SD",
+                     "Q1 z-score",
+                     "Q3 z-score",
+                     "Pearson Skewness",
+                 ],
+                 "Value": [
+                     f"{R:.2f}",
+                     f"{q1_z:.2f}",
+                     f"{q3_z:.2f}",
+                     f"{skew:.2f}",
+                 ],
+                 "Flag": [
+                     (
+                         # IQR/SD is ~1.35 for a normal distribution; heavy tails
+                         # inflate the SD and push the ratio down, light tails push it up.
+                         "Lighter tails"
+                         if R > 1.5
+                         else "Heavier tails" if R < 1.2 else "OK"
+                     ),
+                     "Deviation" if abs(q1_z + 0.6745) > 0.1 else "OK",
+                     "Deviation" if abs(q3_z - 0.6745) > 0.1 else "OK",
+                     skew_interp,
+                 ],
+             }
+         )
+         st.table(norm_checks)
+
+         # Add skewness interpretation guide
+         st.markdown(
+             """
+ **Pearson Skewness Interpretation:**
+ - ≈ 0: Symmetric distribution
+ - ±0.1 to ±0.5: Slight/moderate skew
+ - ±0.5 to ±1: Noticeable skew
+ - Greater than ±1: Strong skew
+
+ - Positive values: Right skew (longer tail on right)
+ - Negative values: Left skew (longer tail on left)
+ """
+         )
+
+         # Warning if heuristic checks indicate non-normality
+         if any(("OK" not in val) for val in norm_checks["Flag"]):
+             st.warning(
+                 "Warning: Heuristic checks indicate possible deviations "
+                 "from normality; interpret z-score and percentiles with "
+                 "caution."
+             )
+
+         # Skew-Corrected Results (optional)
+         with st.expander("Optional: Skew-Corrected Results"):
+             st.write("Adjusts for skew via Pearson Type III back-transform.")
+             st.write("Error often <1 percentile point when |skew| ≤ 0.5.")
+             st.write("Usually more useful for strongly skewed distributions.")
+             st.write("Note: This is a heuristic and may not always be accurate.")
+             res_skew = normalizer_model.compute_skew_corrected_position(
+                 value=value,
+                 mean=res["mean"],
+                 sd=res["sd"],
+                 median=res["median"],
+             )
+             pct_skew = f"{res_skew['percentile_skew_corrected']:.2f}"
+             sc1, sc2 = st.columns(2)
+             sc1.metric(
+                 "Skew-Corrected Z-Score",
+                 f"{res_skew['z_skew_corrected']:.2f}",
+             )
+             sc2.metric(
+                 "Skew-Corrected Percentile",
+                 pct_skew,
+             )
+
+         st.markdown("---")
+         st.subheader("Visualizations")
+         # Prepare data for normal distribution
+         z_vals = np.linspace(-4, 4, 400)
+         density = norm.pdf(z_vals)
+         df_chart = pd.DataFrame({"z": z_vals, "density": density})
+         # Shade area up to observed z-score
+         area = (
+             alt.Chart(df_chart)
+             .mark_area(color="orange", opacity=0.3)
+             .transform_filter(alt.datum.z <= res["z_score"])
+             .encode(
+                 x=alt.X(
+                     "z:Q",
+                     title="z-score",
+                 ),
+                 y=alt.Y(
+                     "density:Q",
+                     title="Density",
+                 ),
+             )
+         )
+         # Plot distribution line
+         line = (
+             alt.Chart(df_chart)
+             .mark_line(color="orange")
+             .encode(
+                 x="z:Q",
+                 y="density:Q",
+             )
+         )
+         # Vertical line at observed z
+         vline = (
+             alt.Chart(pd.DataFrame({"z": [res["z_score"]]}))
+             .mark_rule(color="orange")
+             .encode(x="z:Q")
+         )
+         chart = (area + line + vline).properties(
+             width=600,
+             height=300,
+             title="Standard Normal Distribution",
+         )
+         st.altair_chart(chart, use_container_width=True)
+         # Text summary
+         st.write(
+             f"Your value is z = {res['z_score']:.2f}, which places you in "
+             f"the {res['percentile']:.1f}th percentile of a normal "
+             f"distribution."
+         )
+         # Bullet chart showing z-score location
+         # Using a horizontal bullet gauge from -3 to 3 SD
+         bullet = go.Figure(
+             go.Indicator(
+                 mode="number+gauge",
+                 value=res["z_score"],
+                 number={"suffix": " SD"},
+                 gauge={
+                     "shape": "bullet",
+                     "axis": {
+                         "range": [-3, 3],
+                         "tickmode": "linear",
+                         "dtick": 0.5,
+                     },
+                     "bar": {"color": "orange"},
+                 },
+             )
+         )
+         bullet.update_layout(
+             height=150,
+             margin={"t": 20, "b": 20, "l": 20, "r": 20},
+         )
+         st.plotly_chart(bullet, use_container_width=True)
+         # Show percentile text
+         st.write(f"Percentile: {res['percentile']:.1f}%")
+     else:
+         st.sidebar.info(
+             "Fill in all inputs and click Compute to get the normative Z-score."
+         )
+
+     # Footer
+     st.markdown("---")
+     st.markdown(
+         "Built with ❤️ in Düsseldorf. © Lars Masanneck 2025. "
+         "Thanks to Withings for sharing this data openly."
+     )
+
+
+ if __name__ == "__main__":
+     main()
normality_checks.py ADDED
@@ -0,0 +1,31 @@
+ """
+ normality_checks.py
+
+ Module for normality check heuristics.
+ """
+
+ from typing import Tuple
+
+
+ def iqr_tail_heaviness(iqr: float, sd: float) -> float:
+     """Return ratio R = IQR/SD for tail heaviness checking."""
+     return iqr / sd if sd != 0 else float("nan")
+
+
+ def quartile_z_scores(
+     mean: float,
+     sd: float,
+     q1: float,
+     q3: float,
+ ) -> Tuple[float, float]:
+     """Return observed z-scores for Q1 and Q3."""
+     if sd == 0:
+         return (float("nan"), float("nan"))
+     q1_z = (q1 - mean) / sd
+     q3_z = (q3 - mean) / sd
+     return q1_z, q3_z
+
+
+ def pearson_skewness(mean: float, median: float, sd: float) -> float:
+     """Return Pearson's second (median-based) skewness coefficient, 3 * (mean - median) / SD."""
+     return 3 * (mean - median) / sd if sd != 0 else float("nan")
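Note (not part of the commit): the reference values that app.py compares these heuristics against come from the standard normal distribution, where Q1 ≈ −0.6745, Q3 ≈ +0.6745, and hence IQR/SD ≈ 1.35. A minimal sanity-check sketch using only the functions above:

    import normality_checks as nc

    # Exact quartiles of a standard normal N(0, 1)
    mean, sd, median = 0.0, 1.0, 0.0
    q1, q3 = -0.6745, 0.6745

    print(nc.iqr_tail_heaviness(q3 - q1, sd))      # ~1.35, the normal reference ratio
    print(nc.quartile_z_scores(mean, sd, q1, q3))  # (-0.6745, 0.6745)
    print(nc.pearson_skewness(mean, median, sd))   # 0.0 for a symmetric distribution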
normalizer_model.py ADDED
@@ -0,0 +1,414 @@
+ """
+ normalizer_model.py - v2
+
+ Utility functions for computing z-scores and percentiles for any biomarker
+ contained in *Table_1_summary_measure.csv* (or an equivalent Excel export).
+
+ Author: Lars Masanneck 06-05-2025
+ """
+
+ from __future__ import annotations
+
+ import math
+ import pathlib
+ import warnings
+ from typing import Dict, Iterable, List, Sequence, Union
+
+ import pandas as pd
+ from scipy import stats
+ from datetime import datetime
+
+
+ ###############################################################################
+ # Public API (re-exported in __all__)
+ ###############################################################################
+
+ __all__ = [
+     "load_normative_table",
+     "compute_normative_position",
+     "add_normative_columns",
+     "categorize_bmi",
+     "compute_skew_corrected_position",
+ ]
+
+ ###############################################################################
+ # Constant category mappings
+ ###############################################################################
+
+ # BMI categories (WHO definition)
+ _BMI_BOUNDS: List[tuple[float, float, str]] = [
+     (0, 18.5, "Underweight"),
+     (18.5, 25, "Healthy"),
+     (25, 30, "Overweight"),
+     (30, math.inf, "Obesity"),
+ ]
+
+ ###############################################################################
+ # Helper functions – categories & loading
+ ###############################################################################
+
+
+ def _categorize(value: float, bounds: Sequence[tuple]) -> str:
+     """Return category *label* for *value* given (lower, upper, label) tuples."""
+     for lower, upper, label in bounds:
+         if lower <= value < upper:
+             return label
+     raise ValueError(f"{value} outside defined bounds.")
+
+
+ def categorize_bmi(bmi: Union[str, float]) -> str:
+     """Map numeric BMI to the table's BMI category strings."""
+     if isinstance(bmi, str):
+         return bmi.strip().capitalize()
+     return _categorize(float(bmi), _BMI_BOUNDS)
+
+
+ def _categorize_age(age: Union[str, int], normative_df: pd.DataFrame) -> str:
+     """Return an age-group string for a numeric age, or pass through if already a string."""
+     if isinstance(age, str):
+         return age.strip()
+     for grp in normative_df["Age"].unique():
+         grp = grp.strip()
+         if "-" in grp:
+             lo, hi = grp.split("-", 1)
+             try:
+                 lo_i, hi_i = int(lo), int(hi)
+             except ValueError:
+                 continue
+             if lo_i <= age <= hi_i:
+                 return grp
+         elif grp.endswith("+"):
+             try:
+                 lo_i = int(grp[:-1])
+             except ValueError:
+                 continue
+             if age >= lo_i:
+                 return grp
+     raise ValueError(f"No normative age group found for age {age!r}.")
+
+
+ def load_normative_table(path):
+     path = pathlib.Path(path)
+     if not path.exists():
+         raise FileNotFoundError(path)
+     # columns to keep as strings
+     str_cols = ["Age", "area", "gender", "Bmi", "Biomarkers", "nb_category"]
+     # columns to cast to floats (recovering numbers from any date-formatted cells)
+     float_cols = [
+         "min",
+         "max",
+         "median",
+         "q1",
+         "q3",
+         "iqr",
+         "mad",
+         "mean",
+         "sd",
+         "se",
+         "ci",
+     ]
+
+     def parse_num(x):
+         # Excel-formatted dates get parsed into datetime; map back to original float:
+         if isinstance(x, datetime):
+             # if year is in the future (e.g. 3183 → original was 3183.xx),
+             # treat year as integer part and month as two-digit fractional
+             if x.year > datetime.now().year:
+                 return x.year + x.month / 100
+             # otherwise (small numbers like 5.06 → parsed as 2025-06-05),
+             # use day as integer and month as two-digit fractional
+             return x.day + x.month / 100
+         # non-dates: just a normal float cast (coerce errors to NA)
+         try:
+             return float(x)
+         except Exception:
+             return pd.NA
+
+     # build the converters
+     converters = {col: str for col in str_cols}
+     converters.update({col: parse_num for col in float_cols})
+
+     # read the normative table (Excel or CSV) with the converters
+     if path.suffix.lower() == ".csv":
+         df = pd.read_csv(path, converters=converters)
+     else:
+         df = pd.read_excel(path, converters=converters)
+
+     # ensure string cols are truly str dtype
+     for c in str_cols:
+         df[c] = df[c].astype(str)
+     df.columns = df.columns.str.strip()
+
+     return df
+
+
+ ###############################################################################
+ # Core calculations
+ ###############################################################################
+
+
+ def _extract_stats(
+     normative_df: pd.DataFrame,
+     biomarker: str,
+     age_group: str,
+     region: str,
+     gender: str,
+     bmi_category: str,
+ ) -> Dict[str, Union[float, str]]:
+     """Return all summary statistics for the requested stratum."""
+     mask = (
+         (normative_df["Biomarkers"].str.lower() == biomarker.lower())
+         & (normative_df["Age"].str.lower() == age_group.lower())
+         & (normative_df["area"].str.lower() == region.lower())
+         & (normative_df["gender"].str.lower() == gender.lower())
+         & (normative_df["Bmi"].str.lower() == bmi_category.lower())
+     )
+     subset = normative_df.loc[mask]
+     if subset.empty:
+         raise KeyError("No normative stats found for the specified stratum.")
+     if len(subset) > 1:
+         warnings.warn(
+             "Multiple normative rows found; using the first one (check your table)."
+         )
+     row = subset.iloc[0]
+     # Some versions of the table label sample size as "n" instead of "nb_category"
+     n_col = "nb_category" if "nb_category" in row else "n"
+     n = str(row[n_col])
+
+     return {
+         "median": float(row["median"]),
+         "q1": float(row["q1"]),
+         "q3": float(row["q3"]),
+         "iqr": float(row["iqr"]),
+         "mad": float(row["mad"]),
+         "mean": float(row["mean"]),
+         "sd": float(row["sd"]),
+         "se": float(row["se"]),
+         "ci": float(row["ci"]),
+         "n": n,
+     }
+
+
+ def z_score(value: float, mean: float, sd: float) -> float:
+     """Compute z-score; returns NaN if SD is 0."""
+     if sd == 0:
+         return float("nan")
+     return (value - mean) / sd
+
+
+ def percentile_from_z(z: float) -> float:
+     """Convert z-score to percentile (0-100)."""
+     return float(stats.norm.cdf(z) * 100)
+
+
+ def compute_normative_position(
+     *,
+     value: float,
+     biomarker: str,
+     age_group: Union[str, int],
+     region: str,
+     gender: str,
+     bmi: Union[str, float],
+     normative_df: pd.DataFrame,
+ ) -> Dict[str, Union[float, str]]:
+     """
+     Compute where a single measurement falls relative to a normative distribution.
+
+     Parameters
+     ----------
+     value : float
+         Raw measurement for the specified biomarker.
+     biomarker : str
+         Name of the biomarker (must match a value in the "Biomarkers" column
+         of `normative_df`).
+     age_group : Union[str, int]
+         Either:
+         - A string age-group label (e.g. "40-49") matching `normative_df["Age"]`, or
+         - An integer age, which will be mapped into the correct age-group bracket.
+     region : str
+         Region name matching `normative_df["area"]` (case-insensitive).
+     gender : str
+         Gender label matching `normative_df["gender"]` (case-insensitive).
+     bmi : Union[str, float]
+         Either:
+         - A string BMI category (e.g. "Healthy"), or
+         - A numeric BMI value, which will be bucketed into WHO categories.
+     normative_df : pd.DataFrame
+         Table of normative summary statistics as returned by `load_normative_table`.
+
+     Returns
+     -------
+     Dict[str, Union[float, str]]
+         A dictionary containing:
+         - "z_score" (float): the computed z-score,
+         - "percentile" (float): the percentile (0–100),
+         - "mean" (float): the normative mean,
+         - "sd" (float): the normative standard deviation,
+         - "n" (str): the sample-size category string from the normative table,
+         - "median" (float): the normative median,
+         - "q1" (float): the first quartile,
+         - "q3" (float): the third quartile,
+         - "iqr" (float): the interquartile range,
+         - "mad" (float): the median absolute deviation,
+         - "se" (float): the standard error,
+         - "ci" (float): the confidence interval.
+
+     Raises
+     ------
+     KeyError
+         If no matching stratum is found in `normative_df`.
+     ValueError
+         If an integer `age_group` cannot be mapped to any age bracket.
+     """
+     # allow numeric age inputs by mapping them to the correct "Age" group
+     age_group_str = _categorize_age(age_group, normative_df)
+     bmi_cat = categorize_bmi(bmi)
+     stats_d = _extract_stats(
+         normative_df=normative_df,
+         biomarker=biomarker,
+         age_group=age_group_str,
+         region=region,
+         gender=gender,
+         bmi_category=bmi_cat,
+     )
+     z = z_score(value, stats_d["mean"], stats_d["sd"])
+     pct = percentile_from_z(z)
+     return {
+         "z_score": z,
+         "percentile": pct,
+         "mean": stats_d["mean"],
+         "sd": stats_d["sd"],
+         "n": stats_d["n"],
+         "median": stats_d["median"],
+         "q1": stats_d["q1"],
+         "q3": stats_d["q3"],
+         "iqr": stats_d["iqr"],
+         "mad": stats_d["mad"],
+         "se": stats_d["se"],
+         "ci": stats_d["ci"],
+     }
+
+
+ ###############################################################################
+ # Batch processing helper
+ ###############################################################################
+
+
+ def _compute_for_row(
+     row: pd.Series,
+     biomarker: str,
+     normative_df: pd.DataFrame,
+     age_col: str,
+     region_col: str,
+     gender_col: str,
+     bmi_col: str,
+     value_col: str,
+ ):
+     try:
+         res = compute_normative_position(
+             value=row[value_col],
+             biomarker=biomarker,
+             age_group=row[age_col],
+             region=row[region_col],
+             gender=row[gender_col],
+             bmi=row[bmi_col],
+             normative_df=normative_df,
+         )
+         return pd.Series(
+             [res["z_score"], res["percentile"]],
+             index=[f"{biomarker}_z", f"{biomarker}_pct"],
+         )
+     except Exception as exc:  # pragma: no cover
+         warnings.warn(str(exc))
+         return pd.Series(
+             [float("nan"), float("nan")], index=[f"{biomarker}_z", f"{biomarker}_pct"]
+         )
+
+
+ def add_normative_columns(
+     df: pd.DataFrame,
+     *,
+     biomarkers: Iterable[str],
+     normative_df: pd.DataFrame,
+     age_col: str = "Age",
+     region_col: str = "area",
+     gender_col: str = "gender",
+     bmi_col: str = "Bmi",
+     value_cols: dict[str, str] | None = None,
+     output_prefixes: dict[str, str] | None = None,
+ ) -> pd.DataFrame:
+     """
+     Append z-score and percentile columns for multiple biomarkers, with optional
+     custom prefixes for the output column names.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         Participant-level data; must include demographic columns and raw biomarker
+         values.
+     biomarkers : Iterable[str]
+         List of biomarker names to process.
+     normative_df : pd.DataFrame
+         Normative summary table as loaded by `load_normative_table`.
+     age_col : str, default "Age"
+         Column in `df` containing age-group labels or integer ages.
+     region_col : str, default "area"
+         Column in `df` matching the "area" field in `normative_df`.
+     gender_col : str, default "gender"
+         Column in `df` matching the "gender" field in `normative_df`.
+     bmi_col : str, default "Bmi"
+         Column in `df` containing BMI values or categories.
+     value_cols : dict[str, str], optional
+         Mapping from each biomarker name to the column in `df` that holds its
+         raw numeric value. Defaults to identity mapping.
+     output_prefixes : dict[str, str], optional
+         Mapping from each biomarker name to the prefix to use for the output
+         columns. Defaults to using the biomarker name itself.
+
+     Returns
+     -------
+     pd.DataFrame
+         A copy of `df` with two new columns for each biomarker:
+         `<prefix>_z` and `<prefix>_pct`.
+     """
+     value_cols = value_cols or {bm: bm for bm in biomarkers}
+     output_prefixes = output_prefixes or {}
+     out = df.copy()
+
+     for bm in biomarkers:
+         prefix = output_prefixes.get(bm, bm)
+         out[[f"{prefix}_z", f"{prefix}_pct"]] = df.apply(
+             _compute_for_row,
+             axis=1,
+             biomarker=bm,
+             normative_df=normative_df,
+             age_col=age_col,
+             region_col=region_col,
+             gender_col=gender_col,
+             bmi_col=bmi_col,
+             value_col=value_cols[bm],
+         )
+
+     return out
+
+
+ # Skew-corrected z-score calculation
+ def compute_skew_corrected_position(
+     value: float, mean: float, sd: float, median: float
+ ) -> dict[str, float]:
+     """Compute skew-corrected z-score and percentile using a Pearson Type III distribution."""
+     # Pearson's second (median-based) skewness coefficient
+     if sd == 0:
+         skewness = float("nan")
+     else:
+         skewness = 3 * (mean - median) / sd
+     # Build Pearson Type III distribution (gamma-based)
+     dist = stats.pearson3(skewness, loc=mean, scale=sd)
+     # Compute percentile under skewed model
+     p = dist.cdf(value)
+     # Back-transform to standard normal z-score
+     z_corr = stats.norm.ppf(p)
+     return {"z_skew_corrected": z_corr, "percentile_skew_corrected": float(p * 100)}
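Note (not part of the commit): a minimal usage sketch of the public API above; the stratum labels below ("Western Europe", "male") are assumptions and must match values that actually occur in the table's "area" and "gender" columns:

    import normalizer_model

    norm_df = normalizer_model.load_normative_table("Table_1_summary_measure.csv")

    # Position a single step-count reading within its normative stratum;
    # numeric age and BMI are mapped to the table's age brackets / WHO categories.
    res = normalizer_model.compute_normative_position(
        value=6500,
        biomarker="nb_steps",
        age_group=42,
        region="Western Europe",
        gender="male",
        bmi=23.4,
        normative_df=norm_df,
    )
    print(res["z_score"], res["percentile"])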
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ streamlit==1.26.0
+ pycountry==22.3.5
+ scipy==1.11.3
+ numpy==1.26.0
+ pandas==2.1.0
+ matplotlib==3.8.0
+ seaborn==0.13.0
+ openpyxl==3.1.2
+ altair==5.5.0
+ plotly==5.21.0
static/.gitkeep ADDED
@@ -0,0 +1 @@
+ # static files directory (for CSS, JS, images)