hmacdope committed on
Commit
0154093
·
2 Parent(s): 732f90d 986648a

Merge remote-tracking branch 'origin/main'

Browse files
Files changed (2) hide show
  1. evaluate.py +55 -39
  2. utils.py +17 -13
evaluate.py CHANGED
@@ -10,7 +10,7 @@ from about import (
10
  multiplier_dict,
11
  THROTTLE_MINUTES
12
  )
13
- from utils import bootstrap_metrics, convert_to_log, fetch_dataset_df
14
  from huggingface_hub import hf_hub_download
15
  import datetime
16
  import io
@@ -263,6 +263,7 @@ def evaluate_data(filename: str) -> None:
263
  Path(tmp_name).unlink()
264
 
265
 
 
266
  def calculate_metrics(
267
  results_dataframe: pd.DataFrame,
268
  test_dataframe: pd.DataFrame
@@ -274,60 +275,75 @@ def calculate_metrics(
274
  # 1) Check all columns are present
275
  _check_required_columns(results_dataframe, "Results file", ["Molecule Name"] + ENDPOINTS)
276
  _check_required_columns(test_dataframe, "Test file", ["Molecule Name"] + ENDPOINTS)
 
 
277
  # 2) Check all Molecules in the test set are present in the predictions
278
- merged_df = pd.merge(test_dataframe, results_dataframe, on=['Molecule Name'], how='left', indicator=True)
279
- if not (merged_df['_merge'] == 'both').all():
280
  raise gr.Error("The predictions file is missing some molecules present in the test set. Please ensure all molecules are included.")
281
- # TODO: What to do when a molecule is duplicated in the Predictions file?
282
 
283
- # Compute leaderboard DataFrame
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  final_cols = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
285
  all_endpoint_results = []
286
- for i, measurement in enumerate(ENDPOINTS):
287
- df_pred = results_dataframe[['Molecule Name', measurement]].copy()
288
- df_true = test_dataframe[['Molecule Name', measurement]].copy()
289
- # coerce numeric columns
290
- df_pred[measurement] = pd.to_numeric(df_pred[measurement], errors="coerce")
291
- df_true[measurement] = pd.to_numeric(df_true[measurement], errors="coerce")
292
-
293
- if df_pred[measurement].isnull().all():
294
- # TODO: Allow missing endpoints or raise an error?
295
- raise gr.Error(f"All predictions are missing for endpoint {measurement}. Please provide valid predictions.")
 
296
 
297
- # Drop NaNs and calculate coverage
298
- merged = (
299
- df_pred.rename(columns={measurement: f"{measurement}_pred"})
300
- .merge(
301
- df_true.rename(columns={measurement: f"{measurement}_true"}),
302
- on="Molecule Name",
303
- how="inner",
304
- )
305
- .dropna(subset=[f"{measurement}_pred", f"{measurement}_true"])
306
- )
307
- merged = merged.sort_values("Molecule Name", kind="stable")
308
- pred_col = f"{measurement}_pred"
309
- true_col = f"{measurement}_true"
310
-
311
- if measurement not in ['logD']:
312
- # Force log scale for all endpoints except LogD (for outliers)
313
- merged[pred_col] = convert_to_log(merged[pred_col], multiplier_dict.get(measurement, 1)).to_numpy()
314
- merged[true_col] = convert_to_log(merged[true_col], multiplier_dict.get(measurement, 1)).to_numpy()
315
-
316
- y_pred = merged[pred_col].to_numpy()
317
- y_true = merged[true_col].to_numpy()
318
- # Calculate dataframe with the metrics for 1000 bootstraps
319
- bootstrap_df = bootstrap_metrics(y_pred, y_true, measurement, n_bootstrap_samples=1000)
320
  df_endpoint = bootstrap_df.pivot_table(
321
  index=["Endpoint"],
322
  columns="Metric",
323
  values="Value",
324
  aggfunc=["mean", "std"]
325
  ).reset_index()
 
326
  # Get a df with columns 'mean_MAE', 'std_MAE', ...
327
  df_endpoint.columns = [
328
  f'{i}_{j}' if i != '' else j for i, j in df_endpoint.columns
329
  ]
330
- df_endpoint.rename(columns={'Endpoint_': 'Endpoint'}, inplace=True)
 
331
  all_endpoint_results.append(df_endpoint)
332
 
333
  df_results = pd.concat(all_endpoint_results, ignore_index=True)
 
10
  multiplier_dict,
11
  THROTTLE_MINUTES
12
  )
13
+ from utils import bootstrap_metrics, clip_and_log_transform, fetch_dataset_df
14
  from huggingface_hub import hf_hub_download
15
  import datetime
16
  import io
 
263
  Path(tmp_name).unlink()
264
 
265
 
266
+
267
  def calculate_metrics(
268
  results_dataframe: pd.DataFrame,
269
  test_dataframe: pd.DataFrame
 
275
  # 1) Check all columns are present
276
  _check_required_columns(results_dataframe, "Results file", ["Molecule Name"] + ENDPOINTS)
277
  _check_required_columns(test_dataframe, "Test file", ["Molecule Name"] + ENDPOINTS)
278
+
279
+
280
  # 2) Check all Molecules in the test set are present in the predictions
281
+ if not (results_dataframe['Molecule Name'].isin(test_dataframe['Molecule Name'])).all():
 
282
  raise gr.Error("The predictions file is missing some molecules present in the test set. Please ensure all molecules are included.")
 
283
 
284
+
285
+ # 3) check no duplicated molecules in the predictions file
286
+ if results_dataframe['Molecule Name'].duplicated().any():
287
+ raise gr.Error("The predictions file contains duplicated molecules. Please ensure each molecule is only listed once.")
288
+
289
+ # 4) Merge dataframes to ensure alignment
290
+ merged_df = results_dataframe.merge(
291
+ test_dataframe,
292
+ on="Molecule Name",
293
+ suffixes=('_pred', '_true'),
294
+ how="inner"
295
+ )
296
+ merged_df = merged_df.sort_values("Molecule Name")
297
+
298
+ # 5) loop over endpoints
299
+
300
  final_cols = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
301
  all_endpoint_results = []
302
+
303
+ for ept in ENDPOINTS:
304
+ pred_col = f"{ept}_pred"
305
+ true_col = f"{ept}_true"
306
+
307
+ # cast to numeric, coerce errors to NaN
308
+ merged_df[pred_col] = pd.to_numeric(merged_df[pred_col], errors="coerce")
309
+ merged_df[true_col] = pd.to_numeric(merged_df[true_col], errors="coerce")
310
+
311
+ if merged_df[pred_col].isnull().all():
312
+ raise gr.Error(f"All predictions are missing for endpoint {ept}. Please provide valid predictions.")
313
 
314
+ # subset and drop NaNs
315
+ subset = merged_df[[pred_col, true_col]].dropna()
316
+ if subset.empty:
317
+ raise gr.Error(f"No valid data available for endpoint {ept} after removing NaNs.")
318
+
319
+ # extract numpy arrays
320
+ y_pred = subset[pred_col].to_numpy()
321
+ y_true = subset[true_col].to_numpy()
322
+
323
+ # apply log10 + 1 transform except for logD
324
+ if ept.lower() not in ['logd']:
325
+ y_true_log = clip_and_log_transform(y_true)
326
+ y_pred_log = clip_and_log_transform(y_pred)
327
+
328
+ else:
329
+ y_true_log = y_true
330
+ y_pred_log = y_pred
331
+
332
+ # calculate metrics with bootstrapping
333
+ bootstrap_df = bootstrap_metrics(y_pred_log, y_true_log, ept, n_bootstrap_samples=1000)
 
 
 
334
  df_endpoint = bootstrap_df.pivot_table(
335
  index=["Endpoint"],
336
  columns="Metric",
337
  values="Value",
338
  aggfunc=["mean", "std"]
339
  ).reset_index()
340
+
341
  # Get a df with columns 'mean_MAE', 'std_MAE', ...
342
  df_endpoint.columns = [
343
  f'{i}_{j}' if i != '' else j for i, j in df_endpoint.columns
344
  ]
345
+
346
+ df_endpoint.rename(columns={'Endpoint_': 'Endpoint'}, inplace=True)
347
  all_endpoint_results.append(df_endpoint)
348
 
349
  df_results = pd.concat(all_endpoint_results, ignore_index=True)
utils.py CHANGED
@@ -57,11 +57,19 @@ def fetch_dataset_df():
57
  latest.rename(columns={"submission_time": "submission time"}, inplace=True)
58
  return latest
59
 
60
def convert_to_log(data: pd.Series, multiplier: float) -> pd.Series:
    """Convert measurements to a log10 scale after adjusting units.

    Values are floored at 0.01 before the unit multiplier is applied, so
    that ``log10`` never receives zero (which would yield ``-inf``).

    Parameters
    ----------
    data : pd.Series
        Raw measurement values.
    multiplier : float
        Unit-conversion factor applied after the floor.

    Returns
    -------
    pd.Series
        The log10-transformed values.
    """
    # Floor at 0.01 first, then rescale units, then take the log.
    floored = np.clip(data, a_min=0.01, a_max=None)
    return np.log10(floored * multiplier)
 
 
 
 
 
 
 
 
65
 
66
  def bootstrap_sampling(size: int, n_samples: int) -> np.ndarray:
67
  """
@@ -87,14 +95,12 @@ def metrics_per_ep(pred: np.ndarray,
87
  true: np.ndarray
88
  )->Tuple[float, float, float, float]:
89
  """Predict evaluation metrics for a single sample
90
-
91
  Parameters
92
  ----------
93
  pred : np.ndarray
94
  Array with predictions
95
  true : np.ndarray
96
  Array with actual values
97
-
98
  Returns
99
  -------
100
  Tuple[float, float, float, float]
@@ -108,18 +114,17 @@ def metrics_per_ep(pred: np.ndarray,
108
  r2=np.nan
109
  else:
110
  r2 = r2_score(true, pred)
111
- spr, _ = spearmanr(true, pred)
112
- ktau, _ = kendalltau(true, pred)
113
 
114
  return mae, rae, r2, spr, ktau
115
 
116
  def bootstrap_metrics(pred: np.ndarray,
117
- true: np.ndarray,
118
- endpoint: str,
119
  n_bootstrap_samples=1000
120
  )->pd.DataFrame:
121
  """Calculate bootstrap metrics given predicted and true values
122
-
123
  Parameters
124
  ----------
125
  pred : np.ndarray
@@ -130,7 +135,6 @@ def bootstrap_metrics(pred: np.ndarray,
130
  String with endpoint
131
  n_bootstrap_samples : int, optional
132
  Size of bootstrap sample, by default 1000
133
-
134
  Returns
135
  -------
136
  pd.DataFrame
 
57
  latest.rename(columns={"submission_time": "submission time"}, inplace=True)
58
  return latest
59
 
60
+
61
def clip_and_log_transform(y: np.ndarray):
    """
    Floor negative values at zero and map onto a log10(1 + y) scale.

    The +1 offset guarantees log10 always receives values >= 1, so the
    result is finite and non-negative for any clipped input.

    Parameters
    ----------
    y : np.ndarray
        The array to be clipped and transformed.
    """
    # Clip below at zero, then shift by one before taking the log.
    nonnegative = np.clip(y, a_min=0, a_max=None)
    return np.log10(nonnegative + 1)
72
+
73
 
74
  def bootstrap_sampling(size: int, n_samples: int) -> np.ndarray:
75
  """
 
95
  true: np.ndarray
96
  )->Tuple[float, float, float, float]:
97
  """Predict evaluation metrics for a single sample
 
98
  Parameters
99
  ----------
100
  pred : np.ndarray
101
  Array with predictions
102
  true : np.ndarray
103
  Array with actual values
 
104
  Returns
105
  -------
106
  Tuple[float, float, float, float]
 
114
  r2=np.nan
115
  else:
116
  r2 = r2_score(true, pred)
117
+ spr = spearmanr(true, pred).statistic
118
+ ktau = kendalltau(true, pred).statistic
119
 
120
  return mae, rae, r2, spr, ktau
121
 
122
  def bootstrap_metrics(pred: np.ndarray,
123
+ true: np.ndarray,
124
+ endpoint: str,
125
  n_bootstrap_samples=1000
126
  )->pd.DataFrame:
127
  """Calculate bootstrap metrics given predicted and true values
 
128
  Parameters
129
  ----------
130
  pred : np.ndarray
 
135
  String with endpoint
136
  n_bootstrap_samples : int, optional
137
  Size of bootstrap sample, by default 1000
 
138
  Returns
139
  -------
140
  pd.DataFrame