"""
Feature discovery using Random Forests and other methods.
"""
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA


class FeatureDiscovery:
    """
    Discover important features and relationships in geopolitical data.
    """

    def __init__(self):
        """Initialize feature discovery."""
        self.feature_scores: Dict[str, float] = {}

    def discover_important_features(
        self,
        X: pd.DataFrame,
        y: np.ndarray,
        n_top: int = 10
    ) -> List[Tuple[str, float]]:
        """
        Discover the most important features using a Random Forest.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix
        y : np.ndarray
            Target variable
        n_top : int
            Number of top features to return

        Returns
        -------
        list
            List of (feature_name, importance_score) tuples
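
        Examples
        --------
        A hypothetical call, assuming ``X`` is a feature DataFrame and
        ``y`` a matching target array (neither is defined in this module):

        >>> fd = FeatureDiscovery()
        >>> top_features = fd.discover_important_features(X, y, n_top=3)  # doctest: +SKIP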
"""
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)
importance = model.feature_importances_
self.feature_scores = dict(zip(X.columns, importance))
sorted_features = sorted(
self.feature_scores.items(),
key=lambda x: x[1],
reverse=True
)
return sorted_features[:n_top]

    def discover_latent_factors(
        self,
        X: pd.DataFrame,
        n_components: int = 5
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Discover latent factors using PCA.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix
        n_components : int
            Number of components

        Returns
        -------
        tuple
            (transformed_data, explained_variance_ratio)
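
        Examples
        --------
        A hypothetical call on a feature DataFrame ``X`` (not defined here):

        >>> fd = FeatureDiscovery()
        >>> factors, variance = fd.discover_latent_factors(X, n_components=3)  # doctest: +SKIP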
"""
pca = PCA(n_components=n_components)
transformed = pca.fit_transform(X)
return transformed, pca.explained_variance_ratio_
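

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): the synthetic DataFrame, column names and
# target below are assumptions made for demonstration; they are not part of
# the module's data model.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    X_demo = pd.DataFrame(
        rng.normal(size=(200, 6)),
        columns=[f"indicator_{i}" for i in range(6)],
    )
    # A target loosely driven by two of the synthetic indicators.
    y_demo = (
        2.0 * X_demo["indicator_0"].to_numpy()
        - X_demo["indicator_3"].to_numpy()
        + rng.normal(scale=0.1, size=200)
    )

    discovery = FeatureDiscovery()

    # Rank features by Random Forest importance.
    for name, score in discovery.discover_important_features(X_demo, y_demo, n_top=3):
        print(f"{name}: {score:.3f}")

    # Project the features onto a small number of principal components.
    factors, variance_ratio = discovery.discover_latent_factors(X_demo, n_components=2)
    print("Explained variance ratio:", variance_ratio)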