"""
Risk scoring using gradient boosting and ensemble methods.
"""

import numpy as np
import pandas as pd
from typing import Any, Dict

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score


class RiskScorer:
    """
    Nonlinear risk scoring using ensemble methods.

    Uses gradient boosting and random forests to discover nonlinear
    patterns in geopolitical risk factors.
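
    Examples
    --------
    A minimal usage sketch; ``X_train``, ``y_train`` and ``X_new`` are
    placeholder names for caller-supplied data, not part of this module::

        >>> scorer = RiskScorer(method='gradient_boosting')
        >>> results = scorer.train(X_train, y_train)
        >>> scores = scorer.predict_risk(X_new)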
    """

    def __init__(self, method: str = 'gradient_boosting'):
        """
        Initialize the risk scorer.

        Parameters
        ----------
        method : str
            Estimator to use: 'gradient_boosting' or 'random_forest'.
        """
        self.method = method
        self.model = None
        self.feature_names = None
        self.is_trained = False

    def train(
        self,
        X: pd.DataFrame,
        y: np.ndarray,
        n_estimators: int = 100,
        max_depth: int = 5
    ) -> Dict[str, Any]:
        """
        Train the risk scoring model.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix.
        y : np.ndarray
            Risk labels (0 = low risk, 1 = high risk).
        n_estimators : int
            Number of boosting stages or trees in the ensemble.
        max_depth : int
            Maximum tree depth.

        Returns
        -------
        dict
            Cross-validation summary and feature importances.
        """
        self.feature_names = X.columns.tolist()

        if self.method == 'gradient_boosting':
            self.model = GradientBoostingClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=42
            )
        elif self.method == 'random_forest':
            self.model = RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=42
            )
        else:
            raise ValueError(f"Unknown method: {self.method}")

        self.model.fit(X, y)
        self.is_trained = True

        # Estimate out-of-sample performance with 5-fold cross-validation
        # (cross_val_score refits clones of the model on each fold).
        cv_scores = cross_val_score(self.model, X, y, cv=5)

        return {
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'feature_importance': self.get_feature_importance()
        }

    def predict_risk(self, X: pd.DataFrame) -> np.ndarray:
        """
        Predict risk scores.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix.

        Returns
        -------
        np.ndarray
            Predicted probability of the high-risk class for each row.
        """
        if not self.is_trained:
            raise ValueError("Model not trained yet")

        # Column 1 of predict_proba is the probability of class 1 (high risk).
        return self.model.predict_proba(X)[:, 1]

    def get_feature_importance(self) -> Dict[str, float]:
        """
        Get feature importance scores.

        Returns
        -------
        dict
            Mapping from feature name to impurity-based importance.
        """
        if not self.is_trained:
            raise ValueError("Model not trained yet")

        importance = self.model.feature_importances_
        return dict(zip(self.feature_names, importance))

    def explain_prediction(self, X: pd.DataFrame, index: int) -> Dict[str, Any]:
        """
        Explain a specific prediction.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix.
        index : int
            Positional index of the sample to explain.

        Returns
        -------
        dict
            Risk score, top risk factors, and the sample's feature values.
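
        Examples
        --------
        Illustrative call on a trained scorer; ``X_new`` is a placeholder
        name for caller-supplied data::

            >>> explanation = scorer.explain_prediction(X_new, index=0)
            >>> explanation['top_risk_factors']  # five largest heuristic contributions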
        """
        if not self.is_trained:
            raise ValueError("Model not trained yet")

        sample = X.iloc[index:index + 1]
        risk_score = self.predict_risk(sample)[0]

        feature_values = sample.iloc[0].to_dict()
        feature_importance = self.get_feature_importance()

        # Heuristic contribution: feature value weighted by global importance.
        # This is a rough approximation, not an exact attribution such as SHAP.
        contributions = {
            feat: feature_values[feat] * feature_importance[feat]
            for feat in feature_values
        }

        return {
            'risk_score': risk_score,
            'top_risk_factors': sorted(contributions.items(), key=lambda x: x[1], reverse=True)[:5],
            'feature_values': feature_values
        }
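

# A minimal, self-contained usage sketch on synthetic data. The feature names
# and the synthetic labelling rule below are illustrative assumptions, not part
# of any production pipeline.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_demo = pd.DataFrame({
        'political_instability': rng.random(200),
        'sanctions_exposure': rng.random(200),
        'conflict_intensity': rng.random(200),
    })
    # Synthetic high-risk label: top third of the summed factors.
    y_demo = (X_demo.sum(axis=1) > X_demo.sum(axis=1).quantile(0.67)).astype(int).to_numpy()

    scorer = RiskScorer(method='gradient_boosting')
    results = scorer.train(X_demo, y_demo, n_estimators=50, max_depth=3)
    print("CV accuracy: %.3f +/- %.3f" % (results['cv_mean'], results['cv_std']))
    print("Explanation for row 0:", scorer.explain_prediction(X_demo, index=0))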