hmacdope-omsf / hmacdope committed
Commit 986648a · verified · Parent(s): be49025

Update Eval (#2)

- update eval (d485d48fddb6008589f34301c05c8d8eadcf7d4f)

Co-authored-by: Hugo MacDermott-Opeskin <hmacdope@users.noreply.huggingface.co>

Files changed (2):
  1. evaluate.py +55 -40
  2. utils.py +17 -13
evaluate.py CHANGED
@@ -7,9 +7,8 @@ from about import (
     submissions_repo,
     results_repo,
     test_repo,
-    multiplier_dict,
 )
-from utils import bootstrap_metrics, convert_to_log
+from utils import bootstrap_metrics, clip_and_log_transform
 from huggingface_hub import hf_hub_download
 import datetime
 import io
@@ -252,6 +251,7 @@ def evaluate_data(filename: str) -> None:
     Path(tmp_name).unlink()
 
 
+
 def calculate_metrics(
     results_dataframe: pd.DataFrame,
     test_dataframe: pd.DataFrame
@@ -263,60 +263,75 @@ def calculate_metrics(
     # 1) Check all columns are present
     _check_required_columns(results_dataframe, "Results file", ["Molecule Name"] + ENDPOINTS)
     _check_required_columns(test_dataframe, "Test file", ["Molecule Name"] + ENDPOINTS)
+
+
     # 2) Check all Molecules in the test set are present in the predictions
-    merged_df = pd.merge(test_dataframe, results_dataframe, on=['Molecule Name'], how='left', indicator=True)
-    if not (merged_df['_merge'] == 'both').all():
+    if not (results_dataframe['Molecule Name'].isin(test_dataframe['Molecule Name'])).all():
         raise gr.Error("The predictions file is missing some molecules present in the test set. Please ensure all molecules are included.")
-    # TODO: What to do when a molecule is duplicated in the Predictions file?
 
-    # Compute leaderboard DataFrame
+
+    # 3) check no duplicated molecules in the predictions file
+    if results_dataframe['Molecule Name'].duplicated().any():
+        raise gr.Error("The predictions file contains duplicated molecules. Please ensure each molecule is only listed once.")
+
+    # 4) Merge dataframes to ensure alignment
+    merged_df = results_dataframe.merge(
+        test_dataframe,
+        on="Molecule Name",
+        suffixes=('_pred', '_true'),
+        how="inner"
+    )
+    merged_df = merged_df.sort_values("Molecule Name")
+
+    # 5) loop over endpoints
+
     final_cols = ["MAE", "RAE", "R2", "Spearman R", "Kendall's Tau"]
     all_endpoint_results = []
-    for i, measurement in enumerate(ENDPOINTS):
-        df_pred = results_dataframe[['Molecule Name', measurement]].copy()
-        df_true = test_dataframe[['Molecule Name', measurement]].copy()
-        # coerce numeric columns
-        df_pred[measurement] = pd.to_numeric(df_pred[measurement], errors="coerce")
-        df_true[measurement] = pd.to_numeric(df_true[measurement], errors="coerce")
-
-        if df_pred[measurement].isnull().all():
-            # TODO: Allow missing endpoints or raise an error?
-            raise gr.Error(f"All predictions are missing for endpoint {measurement}. Please provide valid predictions.")
+
+    for ept in ENDPOINTS:
+        pred_col = f"{ept}_pred"
+        true_col = f"{ept}_true"
+
+        # cast to numeric, coerce errors to NaN
+        merged_df[pred_col] = pd.to_numeric(merged_df[pred_col], errors="coerce")
+        merged_df[true_col] = pd.to_numeric(merged_df[true_col], errors="coerce")
+
+        if merged_df[pred_col].isnull().all():
+            raise gr.Error(f"All predictions are missing for endpoint {ept}. Please provide valid predictions.")
 
-        # Drop NaNs and calculate coverage
-        merged = (
-            df_pred.rename(columns={measurement: f"{measurement}_pred"})
-            .merge(
-                df_true.rename(columns={measurement: f"{measurement}_true"}),
-                on="Molecule Name",
-                how="inner",
-            )
-            .dropna(subset=[f"{measurement}_pred", f"{measurement}_true"])
-        )
-        merged = merged.sort_values("Molecule Name", kind="stable")
-        pred_col = f"{measurement}_pred"
-        true_col = f"{measurement}_true"
-
-        if measurement not in ['logD']:
-            # Force log scale for all endpoints except LogD (for outliers)
-            merged[pred_col] = convert_to_log(merged[pred_col], multiplier_dict.get(measurement, 1)).to_numpy()
-            merged[true_col] = convert_to_log(merged[true_col], multiplier_dict.get(measurement, 1)).to_numpy()
-
-        y_pred = merged[pred_col].to_numpy()
-        y_true = merged[true_col].to_numpy()
-        # Calculate dataframe with the metrics for 1000 bootstraps
-        bootstrap_df = bootstrap_metrics(y_pred, y_true, measurement, n_bootstrap_samples=1000)
+        # subset and drop NaNs
+        subset = merged_df[[pred_col, true_col]].dropna()
+        if subset.empty:
+            raise gr.Error(f"No valid data available for endpoint {ept} after removing NaNs.")
+
+        # extract numpy arrays
+        y_pred = subset[pred_col].to_numpy()
+        y_true = subset[true_col].to_numpy()
+
+        # apply log10 + 1 transform except for logD
+        if ept.lower() not in ['logd']:
+            y_true_log = clip_and_log_transform(y_true)
+            y_pred_log = clip_and_log_transform(y_pred)
+
+        else:
+            y_true_log = y_true
+            y_pred_log = y_pred
+
+        # calculate metrics with bootstrapping
+        bootstrap_df = bootstrap_metrics(y_pred_log, y_true_log, ept, n_bootstrap_samples=1000)
         df_endpoint = bootstrap_df.pivot_table(
             index=["Endpoint"],
             columns="Metric",
             values="Value",
             aggfunc=["mean", "std"]
         ).reset_index()
+
         # Get a df with columns 'mean_MAE', 'std_MAE', ...
        df_endpoint.columns = [
             f'{i}_{j}' if i != '' else j for i, j in df_endpoint.columns
         ]
-        df_endpoint.rename(columns={'Endpoint_': 'Endpoint'}, inplace=True)
+
+        df_endpoint.rename(columns={'Endpoint_': 'Endpoint'}, inplace=True)
         all_endpoint_results.append(df_endpoint)
 
     df_results = pd.concat(all_endpoint_results, ignore_index=True)
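
For orientation, the refactored calculate_metrics flow can be traced end to end outside the Gradio app. Below is a minimal sketch of the validation, alignment, and transform steps on toy data; the endpoint column "HLM" and the molecule names are made up for illustration, and a plain ValueError stands in for gr.Error:

import pandas as pd
from utils import clip_and_log_transform  # helper added in this commit

# hypothetical toy inputs; the real files carry one column per entry in ENDPOINTS
test_df = pd.DataFrame({"Molecule Name": ["mol-1", "mol-2"], "HLM": [12.0, 48.0]})
pred_df = pd.DataFrame({"Molecule Name": ["mol-2", "mol-1"], "HLM": [52.0, 10.5]})

# 2) every predicted molecule must appear in the test set
if not pred_df["Molecule Name"].isin(test_df["Molecule Name"]).all():
    raise ValueError("predictions file is missing molecules")

# 3) no duplicated molecules in the predictions file
if pred_df["Molecule Name"].duplicated().any():
    raise ValueError("predictions file contains duplicated molecules")

# 4) one suffixed inner merge row-aligns predictions with ground truth
merged = pred_df.merge(test_df, on="Molecule Name",
                       suffixes=("_pred", "_true"), how="inner")
merged = merged.sort_values("Molecule Name")

# 5) per endpoint: coerce to numeric, drop NaNs, then log-transform
merged["HLM_pred"] = pd.to_numeric(merged["HLM_pred"], errors="coerce")
merged["HLM_true"] = pd.to_numeric(merged["HLM_true"], errors="coerce")
subset = merged[["HLM_pred", "HLM_true"]].dropna()
y_pred = clip_and_log_transform(subset["HLM_pred"].to_numpy())
y_true = clip_and_log_transform(subset["HLM_true"].to_numpy())
print(y_pred, y_true)  # arrays ready for bootstrap_metrics

The single suffixed merge replaces the per-endpoint rename-and-merge of the old code, so every endpoint now reads from the same aligned frame.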
utils.py CHANGED
@@ -57,11 +57,19 @@ def fetch_dataset_df():
     latest.rename(columns={"submission_time": "submission time"}, inplace=True)
     return latest
 
-def convert_to_log(data: pd.Series, multiplier: float) -> pd.Series:
-    # Add 0.01 to avoid inf
-    values = np.clip(data, a_min=0.01, a_max=None)
-    values = values * multiplier # Adjust units
-    return np.log10(values)
+
+def clip_and_log_transform(y: np.ndarray):
+    """
+    Clip to a detection limit and transform to log10 scale.
+
+    Parameters
+    ----------
+    y : np.ndarray
+        The array to be clipped and transformed.
+    """
+    y = np.clip(y, a_min=0, a_max=None)
+    return np.log10(y + 1)
+
 
 def bootstrap_sampling(size: int, n_samples: int) -> np.ndarray:
     """
@@ -87,14 +95,12 @@ def metrics_per_ep(pred: np.ndarray,
                    true: np.ndarray
                    )->Tuple[float, float, float, float]:
     """Predict evaluation metrics for a single sample
-
     Parameters
     ----------
     pred : np.ndarray
         Array with predictions
     true : np.ndarray
         Array with actual values
-
     Returns
     -------
     Tuple[float, float, float, float]
@@ -108,18 +114,17 @@ def metrics_per_ep(pred: np.ndarray,
         r2=np.nan
     else:
         r2 = r2_score(true, pred)
-    spr, _ = spearmanr(true, pred)
-    ktau, _ = kendalltau(true, pred)
+    spr = spearmanr(true, pred).statistic
+    ktau = kendalltau(true, pred).statistic
 
     return mae, rae, r2, spr, ktau
 
 def bootstrap_metrics(pred: np.ndarray,
-                      true: np.ndarray,
-                      endpoint: str,
+                      true: np.ndarray,
+                      endpoint: str,
                       n_bootstrap_samples=1000
                       )->pd.DataFrame:
     """Calculate bootstrap metrics given predicted and true values
-
     Parameters
     ----------
     pred : np.ndarray
@@ -130,7 +135,6 @@ def bootstrap_metrics(pred: np.ndarray,
         String with endpoint
     n_bootstrap_samples : int, optional
         Size of bootstrapsample, by default 1000
-
     Returns
     -------
     pd.DataFrame
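
As a quick sanity check of the utils.py changes: the new clip_and_log_transform clips at zero and applies log10(y + 1), so zeros map to a finite 0.0 instead of -inf (the old convert_to_log clipped at 0.01 for the same reason), and the rank correlations are now read off SciPy result objects via .statistic, which assumes SciPy >= 1.9. A minimal sketch with made-up values:

import numpy as np
from scipy.stats import kendalltau, spearmanr
from utils import clip_and_log_transform  # helper added in this commit

y = np.array([-5.0, 0.0, 9.0, 99.0])
# negatives clip to 0, then log10(y + 1) -> [0., 0., 1., 2.]
print(clip_and_log_transform(y))

true = np.array([1.0, 2.0, 3.0, 4.0])
pred = np.array([1.1, 1.9, 3.2, 3.8])
# SciPy >= 1.9 result objects expose .statistic; tuple unpacking also still works
print(spearmanr(true, pred).statistic)   # 1.0 on perfectly monotone toy data
print(kendalltau(true, pred).statistic)  # 1.0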