KPLabs
/

HYPERVIEW-RandomForest

Joblib

Model card Files Files and versions

xet

Community

KPLabs committed on Nov 3, 2025

Commit

ed46d2f

verified ·

1 Parent(s): bf2b411

Create train.py

Browse files

Files changed (1) hide show

train.py +210 -0

train.py ADDED Viewed

	@@ -0,0 +1,210 @@

+from utils import calculate_metrics, get_classes, CLASSES
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.utils import shuffle
+from typing import List, Tuple
+from sklearn import metrics
+from tqdm import tqdm
+import pandas as pd
+import numpy as np
+import joblib
+import pywt
+import fire
+import json
+import os
# Number of samples in each split; sample files are looked up as
# "<index>.npz" for every index below these counts.
TRAIN_SIZE, TEST_SIZE = 1732, 1154
# Directories holding the per-sample .npz files of each split.
TRAIN_DIR, TEST_DIR = "train_data_simulated/", "test_data_simulated/"
def load_data() -> Tuple[List, pd.DataFrame, List, pd.DataFrame]:
    """Collect the sample file paths and read the ground-truth tables.

    Returns:
        ``(X_train, y_train, X_test, y_test)``. The ``X_*`` entries are lists
        of paths of the form ``<dir>/<index>.npz`` for every index below the
        split size; the ``y_*`` entries are the DataFrames read from
        ``train_gt.csv`` and ``test_gt.csv`` in the current working directory.
    """

    def _sample_paths(directory: str, count: int) -> List[str]:
        # One path per sample index, in ascending index order.
        return [os.path.join(directory, f"{index}.npz") for index in range(count)]

    train_paths = _sample_paths(TRAIN_DIR, TRAIN_SIZE)
    test_paths = _sample_paths(TEST_DIR, TEST_SIZE)
    train_targets = pd.read_csv("train_gt.csv")
    test_targets = pd.read_csv("test_gt.csv")
    return train_paths, train_targets, test_paths, test_targets
class SpectralCurveFiltering:
    """Callable that collapses axes 1 and 2 of a sample with a reduction.

    The reduction defaults to ``np.mean``; any replacement must accept an
    ``axis`` keyword argument.
    """

    def __init__(self, merge_function=np.mean):
        self.merge_function = merge_function

    def __call__(self, sample: np.ndarray) -> np.ndarray:
        """Reduce ``sample`` over axes ``(1, 2)`` using ``merge_function``."""
        reduce_fn = self.merge_function
        return reduce_fn(sample, axis=(1, 2))
class BaselineRegressor:
    """Constant predictor that always outputs the per-column training mean."""

    def __init__(self):
        self.mean = 0

    def fit(self, X_train: np.ndarray, y_train: np.ndarray):
        """Store the column-wise mean of ``y_train`` and its column count.

        ``X_train`` is not used. Returns ``self`` so calls can be chained.
        """
        column_means = np.mean(y_train, axis=0)
        self.mean = column_means
        self.classes_count = y_train.shape[1]
        return self

    def predict(self, X_test: np.ndarray) -> np.ndarray:
        """Return a ``(len(X_test), classes_count)`` array of the stored mean."""
        output_shape = (len(X_test), self.classes_count)
        return np.full(output_shape, self.mean)
def preprocess(samples_lst: List[str], features: List[str]) -> Tuple:
    """Build a feature matrix from a list of ``.npz`` sample files.

    Each file is loaded, its arrays are passed as keyword arguments to
    ``np.ma.MaskedArray``, and the resulting 3-D cube is padded to square
    spatial size. Five groups of 1-D features are then computed and the
    groups named in ``features`` are concatenated, in that order, into one
    row per sample.

    Args:
        samples_lst: Paths of the ``.npz`` files to process. The list is
            only read; it is not modified.
        features: Names of the feature groups to include. Valid names are
            ``"spatial"``, ``"fft"``, ``"gradient"``, ``"mean"`` and ``"dwt"``.

    Returns:
        ``(matrix, names)`` where ``matrix`` is the ``np.vstack`` of the
        per-sample rows and ``names`` holds, for each sample, a list with
        one group name per feature column.

    Raises:
        KeyError: If ``features`` contains an unknown group name.
        ValueError: From NumPy when there is nothing to concatenate or stack
            (empty ``features`` or empty ``samples_lst``).
    """

    def _shape_pad(data: np.ndarray) -> np.ndarray:
        """
        Pad the two trailing axes so that both equal the larger of the two.
        Not mandatory but eliminates the risk of calculation error
        in singular value decomposition.
        Padding by wrapping also improves the performance slightly.
        """
        max_edge = np.max(data.shape[1:])
        pad_rows = max_edge - data.shape[1]
        pad_cols = max_edge - data.shape[2]
        # NOTE(review): np.pad appears to return a plain ndarray, so any mask
        # on `data` is likely dropped here - confirm this is intended.
        return np.pad(data, ((0, 0), (0, pad_rows), (0, pad_cols)), "wrap")

    filtering = SpectralCurveFiltering()
    w1 = pywt.Wavelet("sym3")
    w2 = pywt.Wavelet("dmey")
    eps = np.finfo(float).eps  # guards the singular-value ratio against /0

    # Rows are collected in a fresh list so the caller's path list survives.
    sample_rows = []
    all_feature_names = []
    for sample_path in tqdm(samples_lst, total=len(samples_lst)):
        # Only the array extraction needs the archive open.
        with np.load(sample_path) as npz:
            data = np.ma.MaskedArray(**npz)
        data = _shape_pad(data)

        # Get the spatial features: leading singular values of every slice
        # along axis 0 (needs a padded edge of at least 5).
        s = np.linalg.svd(data, full_matrices=False, compute_uv=False)
        s0, s1, s2, s3, s4 = (s[:, k] for k in range(5))
        dXds1 = s0 / (s1 + eps)
        ffts = np.fft.fft(s0)
        reals = np.real(ffts)
        imags = np.imag(ffts)

        # Get the specific spectral features:
        data = filtering(data)

        # Cascaded "dmey" decomposition; only approximation coefficients are
        # used. NOTE(review): the fixed slices presumably trim boundary
        # coefficients for a specific band count - verify against the data.
        cA0, _ = pywt.dwt(data, wavelet=w2, mode="constant")
        cAx, _ = pywt.dwt(cA0[12:92], wavelet=w2, mode="constant")
        cAy, _ = pywt.dwt(cAx[15:55], wavelet=w2, mode="constant")
        cAz, _ = pywt.dwt(cAy[15:35], wavelet=w2, mode="constant")
        cAw2 = np.concatenate((cA0[12:92], cAx[15:55], cAy[15:35], cAz[15:25]), -1)

        # Cascaded "sym3" decomposition, dropping one coefficient at each
        # end before descending a level.
        cA0, _ = pywt.dwt(data, wavelet=w1, mode="constant")
        cAx, _ = pywt.dwt(cA0[1:-1], wavelet=w1, mode="constant")
        cAy, _ = pywt.dwt(cAx[1:-1], wavelet=w1, mode="constant")
        cAz, _ = pywt.dwt(cAy[1:-1], wavelet=w1, mode="constant")
        cAw1 = np.concatenate((cA0, cAx, cAy, cAz), -1)

        # First three numerical derivatives of the reduced curve.
        dXdl = np.gradient(data, axis=0)
        d2Xdl2 = np.gradient(dXdl, axis=0)
        d3Xdl3 = np.gradient(d2Xdl2, axis=0)

        fft = np.fft.fft(data)
        real = np.real(fft)
        imag = np.imag(fft)

        features_to_select = {
            "spatial": (dXds1, s0, s1, s2, s3, s4, reals, imags),
            "fft": (real, imag),
            "gradient": (dXdl, d2Xdl2, d3Xdl3),
            "mean": (data,),
            "dwt": (cAw1, cAw2),
        }

        # Assemble the requested groups in the order given by `features`.
        selected = []
        sample_feature_names = []
        for feature_name in features:
            group = features_to_select[feature_name]
            selected.extend(group)
            # One name per column; summing lengths avoids building a
            # throwaway concatenated array just to measure it.
            sample_feature_names.extend(
                [feature_name] * sum(len(part) for part in group)
            )
        sample_rows.append(np.concatenate(selected, -1))
        all_feature_names.append(sample_feature_names)

    return np.vstack(sample_rows), all_feature_names
def runner(features: List[str] = "spatial,fft,dwt,gradient,mean".split(",")):
    """Train a random forest on the selected features and write all outputs.

    Steps: load the data, extract features for both splits, fit a
    ``RandomForestRegressor`` (``random_state=2023``) on the shuffled
    training set, dump the model with joblib, write the test predictions to
    a CSV, and write the evaluation metrics to ``all_metrics.csv`` and
    ``all_metrics.json``.

    Args:
        features: Feature group names to use. A list/tuple of names, or a
            single comma-separated string (which is what a command-line
            wrapper may pass for a single name such as ``spatial``).

    Returns:
        The mean over targets of ``model MSE / baseline MSE``, where the
        baseline always predicts the training mean.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(features, str):
        features = features.split(",")
    # Work on a copy so the shared default list can never be mutated.
    features = list(features)

    X_train, y_train, X_test, y_test = load_data()
    X_train, _ = preprocess(X_train, features)
    X_test, _ = preprocess(X_test, features)
    X_train, y_train = shuffle(X_train, y_train, random_state=2023)

    model = RandomForestRegressor(random_state=2023)
    print(f"Training model on {X_train.shape} features...")
    model = model.fit(X_train, y_train[CLASSES].values)
    joblib.dump(model, f"RF_model_{'-'.join(features)}.joblib")

    submission_df = pd.DataFrame(data=model.predict(X_test), columns=CLASSES)
    submission_df.to_csv(",".join(features) + ".csv", index_label="sample_index")

    y_true = y_test[CLASSES].values
    y_pred = submission_df[CLASSES].values

    # Custom score: per-target MSE relative to the mean-predicting baseline.
    baseline_reg = BaselineRegressor()
    baseline_reg = baseline_reg.fit(X_train, y_train[CLASSES].values)
    baselines_mse = np.mean((y_true - baseline_reg.predict(X_test)) ** 2, axis=0)
    model_mse = np.mean((y_true - y_pred) ** 2, axis=0)
    scores = model_mse / baselines_mse
    final_score = np.mean(scores)

    r2 = metrics.r2_score(y_true=y_true, y_pred=y_pred, multioutput="raw_values")
    mse = metrics.mean_squared_error(
        y_true=y_true, y_pred=y_pred, multioutput="raw_values"
    )
    mae = metrics.mean_absolute_error(
        y_true=y_true, y_pred=y_pred, multioutput="raw_values"
    )
    all_metrics = calculate_metrics(
        y_pred=get_classes(submission_df[CLASSES]),
        y_true=get_classes(y_test[CLASSES]),
    )

    # NOTE(review): these labels presumably mirror the order of CLASSES -
    # verify against utils.
    target_names = ["P", "K", "Mg", "pH"]
    mse = {k + "_mse": v for k, v in zip(target_names, mse.tolist())}
    r2 = {k + "_r2": v for k, v in zip(target_names, r2.tolist())}
    mae = {k + "_mae": v for k, v in zip(target_names, mae.tolist())}

    all_metrics["custom"] = final_score
    all_metrics = pd.DataFrame.from_dict({**all_metrics, **r2, **mse, **mae})
    all_metrics.to_csv("all_metrics.csv", index=False)
    with open("all_metrics.json", "w", encoding="utf-8") as f:
        json.dump(all_metrics.to_dict(), f, ensure_ascii=True, indent=4)

    print(f"Custom score: {final_score}")
    return final_score
if __name__ == "__main__":
    fire.Fire(runner)
    # Smoke check: the dumped model must load back as a random forest.
    # NOTE(review): the path is rebuilt from the default feature list, so
    # this load will not find the file when the CLI was invoked with a
    # different `features` value.
    default_features = "spatial,fft,dwt,gradient,mean".split(",")
    model = joblib.load(f"RF_model_{'-'.join(default_features)}.joblib")
    # Use the public class imported at the top of the file instead of
    # reaching into the private sklearn.ensemble._forest module.
    assert isinstance(model, RandomForestRegressor)