hmacdope-omsf / hmacdope committed
Commit 986648a · verified · Parent(s): be49025

Update Eval (#2)

- update eval (d485d48fddb6008589f34301c05c8d8eadcf7d4f)

Co-authored-by: Hugo MacDermott-Opeskin <hmacdope@users.noreply.huggingface.co>

Files changed (2):
  1. evaluate.py +55 -40
  2. utils.py +17 -13
evaluate.py CHANGED
@@ -7,9 +7,8 @@ from about import (
     submissions_repo,
     results_repo,
     test_repo,
-    multiplier_dict,
 )
-from utils import bootstrap_metrics, convert_to_log
+from utils import bootstrap_metrics, clip_and_log_transform
 from huggingface_hub import hf_hub_download
 import datetime
 import io
@@ -252,6 +251,7 @@ def evaluate_data(filename: str) -> None:
     Path(tmp_name).unlink()
 
 
+
 def calculate_metrics(
     results_dataframe: pd.DataFrame,
     test_dataframe: pd.DataFrame
@@ -263,60 +263,75 @@ def calculate_metrics(
     # 1) Check all columns are present
     _check_required_columns(results_dataframe, "Results file", ["Molecule Name"] + ENDPOINTS)
     _check_required_columns(test_dataframe, "Test file", ["Molecule Name"] + ENDPOINTS)
+
+
     # 2) Check all Molecules in the test set are present in the predictions
-    merged_df = pd.merge(test_dataframe, results_dataframe, on=['Molecule Name'], how='left', indicator=True)
-    if not (merged_df['_merge'] == 'both').all():
+    if not (results_dataframe['Molecule Name'].isin(test_dataframe['Molecule Name'])).all():
         raise gr.Error("The predictions file is missing some molecules present in the test set. Please ensure all molecules are included.")
-    # TODO: What to do when a molecule is duplicated in the Predictions file?
 
-    # Compute leaderboard DataFrame
+
+    # 3) check no duplicated molecules in the predictions file
+    if results_dataframe['Molecule Name'].duplicated().any():
+        raise gr.Error("The predictions file contains duplicated molecules. Please ensure each molecule is only listed once.")
+
+    # 4) Merge dataframes to ensure alignment
+    merged_df = results_dataframe.merge(
+        test_dataframe,
+        on="Molecule Name",
+        suffixes=('_pred', '_true'),
+        how="inner"
+    )
+    merged_df = merged_df.sort_values("Molecule Name")
+
+    # 5) loop over endpoints
+
     final_cols = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
     all_endpoint_results = []
-    for i, measurement in enumerate(ENDPOINTS):
-        df_pred = results_dataframe[['Molecule Name', measurement]].copy()
-        df_true = test_dataframe[['Molecule Name', measurement]].copy()
-        # coerce numeric columns
-        df_pred[measurement] = pd.to_numeric(df_pred[measurement], errors="coerce")
-        df_true[measurement] = pd.to_numeric(df_true[measurement], errors="coerce")
-
-        if df_pred[measurement].isnull().all():
-            # TODO: Allow missing endpoints or raise an error?
-            raise gr.Error(f"All predictions are missing for endpoint {measurement}. Please provide valid predictions.")
+
+    for ept in ENDPOINTS:
+        pred_col = f"{ept}_pred"
+        true_col = f"{ept}_true"
+
+        # cast to numeric, coerce errors to NaN
+        merged_df[pred_col] = pd.to_numeric(merged_df[pred_col], errors="coerce")
+        merged_df[true_col] = pd.to_numeric(merged_df[true_col], errors="coerce")
+
+        if merged_df[pred_col].isnull().all():
+            raise gr.Error(f"All predictions are missing for endpoint {ept}. Please provide valid predictions.")
 
-        # Drop NaNs and calculate coverage
-        merged = (
-            df_pred.rename(columns={measurement: f"{measurement}_pred"})
-            .merge(
-                df_true.rename(columns={measurement: f"{measurement}_true"}),
-                on="Molecule Name",
-                how="inner",
-            )
-            .dropna(subset=[f"{measurement}_pred", f"{measurement}_true"])
-        )
-        merged = merged.sort_values("Molecule Name", kind="stable")
-        pred_col = f"{measurement}_pred"
-        true_col = f"{measurement}_true"
-
-        if measurement not in ['logD']:
-            # Force log scale for all endpoints except LogD (for outliers)
-            merged[pred_col] = convert_to_log(merged[pred_col], multiplier_dict.get(measurement, 1)).to_numpy()
-            merged[true_col] = convert_to_log(merged[true_col], multiplier_dict.get(measurement, 1)).to_numpy()
-
-        y_pred = merged[pred_col].to_numpy()
-        y_true = merged[true_col].to_numpy()
-        # Calculate dataframe with the metrics for 1000 bootstraps
-        bootstrap_df = bootstrap_metrics(y_pred, y_true, measurement, n_bootstrap_samples=1000)
+        # subset and drop NaNs
+        subset = merged_df[[pred_col, true_col]].dropna()
+        if subset.empty:
+            raise gr.Error(f"No valid data available for endpoint {ept} after removing NaNs.")
+
+        # extract numpy arrays
+        y_pred = subset[pred_col].to_numpy()
+        y_true = subset[true_col].to_numpy()
+
+        # apply log10 + 1 transform except for logD
+        if ept.lower() not in ['logd']:
+            y_true_log = clip_and_log_transform(y_true)
+            y_pred_log = clip_and_log_transform(y_pred)
+
+        else:
+            y_true_log = y_true
+            y_pred_log = y_pred
+
+        # calculate metrics with bootstrapping
+        bootstrap_df = bootstrap_metrics(y_pred_log, y_true_log, ept, n_bootstrap_samples=1000)
         df_endpoint = bootstrap_df.pivot_table(
             index=["Endpoint"],
             columns="Metric",
             values="Value",
             aggfunc=["mean", "std"]
         ).reset_index()
+
         # Get a df with columns 'mean_MAE', 'std_MAE', ...
        df_endpoint.columns = [
             f'{i}_{j}' if i != '' else j for i, j in df_endpoint.columns
         ]
-        df_endpoint.rename(columns={'Endpoint_': 'Endpoint'}, inplace=True)
+
+        df_endpoint.rename(columns={'Endpoint_': 'Endpoint'}, inplace=True)
         all_endpoint_results.append(df_endpoint)
 
     df_results = pd.concat(all_endpoint_results, ignore_index=True)
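
For orientation, the refactored calculate_metrics flow can be traced end to end outside the Gradio app. Below is a minimal sketch of the validation, alignment, and transform steps on toy data; the endpoint column "HLM" and the molecule names are made up for illustration, and a plain ValueError stands in for gr.Error:

import pandas as pd
from utils import clip_and_log_transform  # helper added in this commit

# hypothetical toy inputs; the real files carry one column per entry in ENDPOINTS
test_df = pd.DataFrame({"Molecule Name": ["mol-1", "mol-2"], "HLM": [12.0, 48.0]})
pred_df = pd.DataFrame({"Molecule Name": ["mol-2", "mol-1"], "HLM": [52.0, 10.5]})

# 2) every predicted molecule must appear in the test set
if not pred_df["Molecule Name"].isin(test_df["Molecule Name"]).all():
    raise ValueError("predictions file is missing molecules")

# 3) no duplicated molecules in the predictions file
if pred_df["Molecule Name"].duplicated().any():
    raise ValueError("predictions file contains duplicated molecules")

# 4) one suffixed inner merge row-aligns predictions with ground truth
merged = pred_df.merge(test_df, on="Molecule Name",
                       suffixes=("_pred", "_true"), how="inner")
merged = merged.sort_values("Molecule Name")

# 5) per endpoint: coerce to numeric, drop NaNs, then log-transform
merged["HLM_pred"] = pd.to_numeric(merged["HLM_pred"], errors="coerce")
merged["HLM_true"] = pd.to_numeric(merged["HLM_true"], errors="coerce")
subset = merged[["HLM_pred", "HLM_true"]].dropna()
y_pred = clip_and_log_transform(subset["HLM_pred"].to_numpy())
y_true = clip_and_log_transform(subset["HLM_true"].to_numpy())
print(y_pred, y_true)  # arrays ready for bootstrap_metrics

The single suffixed merge replaces the per-endpoint rename-and-merge of the old code, so every endpoint now reads from the same aligned frame.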
utils.py CHANGED
@@ -57,11 +57,19 @@ def fetch_dataset_df():
     latest.rename(columns={"submission_time": "submission time"}, inplace=True)
     return latest
 
-def convert_to_log(data: pd.Series, multiplier: float) -> pd.Series:
-    # Add 0.01 to avoid inf
-    values = np.clip(data, a_min=0.01, a_max=None)
-    values = values * multiplier # Adjust units
-    return np.log10(values)
+
+def clip_and_log_transform(y: np.ndarray):
+    """
+    Clip to a detection limit and transform to log10 scale.
+
+    Parameters
+    ----------
+    y : np.ndarray
+        The array to be clipped and transformed.
+    """
+    y = np.clip(y, a_min=0, a_max=None)
+    return np.log10(y + 1)
+
 
 def bootstrap_sampling(size: int, n_samples: int) -> np.ndarray:
     """
@@ -87,14 +95,12 @@ def metrics_per_ep(pred: np.ndarray,
                    true: np.ndarray
                    )->Tuple[float, float, float, float]:
     """Predict evaluation metrics for a single sample
-
     Parameters
     ----------
     pred : np.ndarray
         Array with predictions
     true : np.ndarray
         Array with actual values
-
     Returns
     -------
     Tuple[float, float, float, float]
@@ -108,18 +114,17 @@ def metrics_per_ep(pred: np.ndarray,
         r2=np.nan
     else:
         r2 = r2_score(true, pred)
-    spr, _ = spearmanr(true, pred)
-    ktau, _ = kendalltau(true, pred)
+    spr = spearmanr(true, pred).statistic
+    ktau = kendalltau(true, pred).statistic
 
     return mae, rae, r2, spr, ktau
 
 def bootstrap_metrics(pred: np.ndarray,
-                      true: np.ndarray,
-                      endpoint: str,
+                      true: np.ndarray,
+                      endpoint: str,
                       n_bootstrap_samples=1000
                       )->pd.DataFrame:
     """Calculate bootstrap metrics given predicted and true values
-
     Parameters
     ----------
     pred : np.ndarray
@@ -130,7 +135,6 @@ def bootstrap_metrics(pred: np.ndarray,
         String with endpoint
     n_bootstrap_samples : int, optional
         Size of bootstrapsample, by default 1000
-
     Returns
     -------
     pd.DataFrame
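
As a quick sanity check of the utils.py changes: the new clip_and_log_transform clips at zero and applies log10(y + 1), so zeros map to a finite 0.0 instead of -inf (the old convert_to_log clipped at 0.01 for the same reason), and the rank correlations are now read off SciPy result objects via .statistic, which assumes SciPy >= 1.9. A minimal sketch with made-up values:

import numpy as np
from scipy.stats import kendalltau, spearmanr
from utils import clip_and_log_transform  # helper added in this commit

y = np.array([-5.0, 0.0, 9.0, 99.0])
# negatives clip to 0, then log10(y + 1) -> [0., 0., 1., 2.]
print(clip_and_log_transform(y))

true = np.array([1.0, 2.0, 3.0, 4.0])
pred = np.array([1.1, 1.9, 3.2, 3.8])
# SciPy >= 1.9 result objects expose .statistic; tuple unpacking also still works
print(spearmanr(true, pred).statistic)   # 1.0 on perfectly monotone toy data
print(kendalltau(true, pred).statistic)  # 1.0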