""" Feature discovery using Random Forests and other methods. """ import numpy as np import pandas as pd from typing import Dict, List, Tuple from sklearn.ensemble import RandomForestRegressor from sklearn.decomposition import PCA class FeatureDiscovery: """ Discover important features and relationships in geopolitical data. """ def __init__(self): """Initialize feature discovery.""" self.feature_scores = {} def discover_important_features( self, X: pd.DataFrame, y: np.ndarray, n_top: int = 10 ) -> List[Tuple[str, float]]: """ Discover most important features using Random Forest. Parameters ---------- X : pd.DataFrame Feature matrix y : np.ndarray Target variable n_top : int Number of top features to return Returns ------- list List of (feature_name, importance_score) tuples """ model = RandomForestRegressor(n_estimators=100, random_state=42) model.fit(X, y) importance = model.feature_importances_ self.feature_scores = dict(zip(X.columns, importance)) sorted_features = sorted( self.feature_scores.items(), key=lambda x: x[1], reverse=True ) return sorted_features[:n_top] def discover_latent_factors( self, X: pd.DataFrame, n_components: int = 5 ) -> Tuple[np.ndarray, np.ndarray]: """ Discover latent factors using PCA. Parameters ---------- X : pd.DataFrame Feature matrix n_components : int Number of components Returns ------- tuple (transformed_data, explained_variance) """ pca = PCA(n_components=n_components) transformed = pca.fit_transform(X) return transformed, pca.explained_variance_ratio_