Spaces:
Running
Running
| """ | |
| Title: Classification with Neural Decision Forests | |
| Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) | |
| Date created: 2021/01/15 | |
| Last modified: 2021/01/15 | |
| Description: How to train differentiable decision trees for end-to-end learning in deep neural networks. | |
| Accelerator: GPU | |
| """ | |
| """ | |
| ## Introduction | |
| This example provides an implementation of the | |
| [Deep Neural Decision Forest](https://ieeexplore.ieee.org/document/7410529) | |
| model introduced by P. Kontschieder et al. for structured data classification. | |
| It demonstrates how to build a stochastic and differentiable decision tree model, | |
| train it end-to-end, and unify decision trees with deep representation learning. | |
| ## The dataset | |
| This example uses the | |
| [United States Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/census+income) | |
| provided by the | |
| [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php). | |
| The task is binary classification | |
| to predict whether a person is likely to be making over USD 50,000 a year. | |
| The dataset includes 48,842 instances with 14 input features (such as age, work class, education, occupation, and so on): 6 numerical features | |
| (one of which, `fnlwgt`, is ignored in this example) and 8 categorical features. | |
| """ | |
| """ | |
| ## Setup | |
| """ | |
| import keras | |
| from keras import layers | |
| from keras.layers import StringLookup | |
| from keras import ops | |
| from tensorflow import data as tf_data | |
| import numpy as np | |
| import pandas as pd | |
| import math | |
| """ | |
| ## Prepare the data | |
| """ | |
# Column names assigned to the raw CSV files, in column order.
CSV_HEADER = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket",
]

train_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
)
test_data_url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
)

# Read both splits with `header=None`, so no row is consumed as a header and
# the column names come from CSV_HEADER.
train_data = pd.read_csv(train_data_url, names=CSV_HEADER, header=None)
test_data = pd.read_csv(test_data_url, names=CSV_HEADER, header=None)

print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")
| """ | |
| Remove the first record (because it is not a valid data example) and a trailing | |
| 'dot' in the class labels. | |
| """ | |
# Drop the first record of the test split (it is not a valid data example).
# `.copy()` makes the result an independent DataFrame, so the column
# assignment below does not write into a slice of the original frame
# (which triggers pandas' SettingWithCopyWarning).
test_data = test_data.iloc[1:].copy()
# Remove the "." characters from the class labels. `regex=False` makes the
# dot a literal character rather than the regex wildcard.
test_data["income_bracket"] = test_data["income_bracket"].str.replace(
    ".", "", regex=False
)
| """ | |
| We store the training and test data splits locally as CSV files. | |
| """ | |
train_data_file, test_data_file = "train_data.csv", "test_data.csv"
# Write both splits without the index column and without a header row.
train_data.to_csv(train_data_file, header=False, index=False)
test_data.to_csv(test_data_file, header=False, index=False)
| """ | |
| ## Define dataset metadata | |
| Here, we define the metadata of the dataset that will be useful for reading, parsing, | |
| and encoding input features. | |
| """ | |
| # A list of the numerical feature names. | |
# Names of the numerical input features.
NUMERIC_FEATURE_NAMES = [
    "age",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]
# Columns that exist in the CSV files but are not used as input features.
IGNORE_COLUMN_NAMES = ["fnlwgt"]
# Name of the target column and the label strings it takes.
TARGET_FEATURE_NAME = "income_bracket"
TARGET_LABELS = [" <=50K", " >50K"]

# Categorical feature name -> sorted list of the distinct values found in the
# training split.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    column: sorted(train_data[column].unique())
    for column in (
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "gender",
        "native_country",
    )
}
# Names of the categorical features, in dictionary order.
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY)
# All input features: numerical first, then categorical.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

# Per-column default values, aligned with CSV_HEADER: a float default for the
# numerical and ignored columns, a string default for everything else.
COLUMN_DEFAULTS = [
    [0.0]
    if (feature_name in NUMERIC_FEATURE_NAMES or feature_name in IGNORE_COLUMN_NAMES)
    else ["NA"]
    for feature_name in CSV_HEADER
]
| """ | |
| ## Create `tf_data.Dataset` objects for training and validation | |
| We create an input function to read and parse the file, and convert features and labels | |
| into a [`tf_data.Dataset`](https://www.tensorflow.org/guide/datasets) | |
| for training and validation. We also preprocess the input by mapping the target label | |
| to an index. | |
| """ | |
# Converts the target label strings into integer class indices.
target_label_lookup = StringLookup(
    vocabulary=TARGET_LABELS, num_oov_indices=0, mask_token=None
)

# One StringLookup layer per categorical feature, converting string values to
# integer indices. No mask token is used and no out-of-vocabulary (OOV) values
# are expected, hence mask_token=None and num_oov_indices=0.
# A plain loop is used on purpose: `feature_name`, `vocabulary` and `lookup`
# stay bound at module level after it finishes, exactly as before.
lookup_dict = {}
for feature_name, vocabulary in CATEGORICAL_FEATURES_WITH_VOCABULARY.items():
    lookup = StringLookup(vocabulary=vocabulary, num_oov_indices=0, mask_token=None)
    lookup_dict[feature_name] = lookup
def encode_categorical(batch_x, batch_y):
    """Replace each categorical feature in `batch_x` with its lookup output.

    Args:
        batch_x: mapping of feature name -> batch of values; updated in place.
        batch_y: batch of targets, passed through untouched.

    Returns:
        The `(batch_x, batch_y)` pair, with every feature listed in
        CATEGORICAL_FEATURE_NAMES run through its entry in `lookup_dict`.
    """
    for name in CATEGORICAL_FEATURE_NAMES:
        to_index = lookup_dict[name]
        batch_x[name] = to_index(batch_x[name])
    return batch_x, batch_y
def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    """Build a cached, batched dataset of (features, label index) from a CSV file.

    Args:
        csv_file_path: path of a header-less CSV file whose columns follow
            CSV_HEADER.
        shuffle: forwarded to `make_csv_dataset`.
        batch_size: number of records per batch.

    Returns:
        A dataset whose targets were mapped through `target_label_lookup` and
        whose categorical features were mapped through `encode_categorical`.
    """

    def index_target(features, target):
        # Convert the label strings into integer class indices.
        return features, target_label_lookup(target)

    raw_batches = tf_data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    )
    with_label_index = raw_batches.map(index_target)
    with_encoded_features = with_label_index.map(encode_categorical)
    # NOTE(review): caching happens after the shuffle inside make_csv_dataset,
    # so presumably the order of the first pass is replayed on later epochs
    # - confirm this is intended.
    return with_encoded_features.cache()
| """ | |
| ## Create model inputs | |
| """ | |
def create_model_inputs():
    """Create one scalar Keras `Input` per entry of FEATURE_NAMES.

    Returns:
        Dict of feature name -> `layers.Input` with `shape=()`; numerical
        features use dtype "float32" and all other features use "int32".
    """
    return {
        name: layers.Input(
            name=name,
            shape=(),
            dtype="float32" if name in NUMERIC_FEATURE_NAMES else "int32",
        )
        for name in FEATURE_NAMES
    }
| """ | |
| ## Encode input features | |
| """ | |
def encode_inputs(inputs):
    """Encode the model inputs into one concatenated feature tensor.

    Categorical inputs (already converted to integer indices) go through an
    `Embedding` layer sized from that feature's own vocabulary. Numerical
    inputs are used as-is, with a trailing axis added when their last
    dimension is unknown so that they can be concatenated with the embeddings.

    Args:
        inputs: dict of feature name -> Keras input tensor.

    Returns:
        The concatenation of all encoded features.
    """
    encoded_features = []
    for feature_name, feature_input in inputs.items():
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            # Size the embedding from this feature's own lookup layer. Reading
            # the module-level `lookup` variable instead would size every
            # embedding from whichever lookup layer was created last.
            vocabulary_size = lookup_dict[feature_name].vocabulary_size()
            embedding_dims = int(math.sqrt(vocabulary_size))
            # Create an embedding layer with the specified dimensions.
            embedding = layers.Embedding(
                input_dim=vocabulary_size, output_dim=embedding_dims
            )
            # Convert the index values to embedding representations.
            encoded_feature = embedding(feature_input)
        else:
            # Use the numerical features as-is.
            encoded_feature = feature_input
            if feature_input.shape[-1] is None:
                encoded_feature = keras.ops.expand_dims(encoded_feature, -1)
        encoded_features.append(encoded_feature)
    return layers.concatenate(encoded_features)
| """ | |
| ## Deep Neural Decision Tree | |
| A neural decision tree model has two sets of weights to learn. The first set is `pi`, | |
| which represents the probability distribution of the classes in the tree leaves. | |
| The second set is the weights of the routing layer `decision_fn`, which represents the probability | |
| of going to each leaf. The forward pass of the model works as follows: | |
| 1. The model expects input `features` as a single vector encoding all the features of an instance | |
| in the batch. This vector can be generated from a Convolutional Neural Network (CNN) applied to images | |
| or dense transformations applied to structured data features. | |
| 2. The model first applies a `used_features_mask` to randomly select a subset of input features to use. | |
| 3. Then, the model computes the probabilities (`mu`) for the input instances to reach the tree leaves | |
| by iteratively performing a *stochastic* routing throughout the tree levels. | |
| 4. Finally, the probabilities of reaching the leaves are combined by the class probabilities at the | |
| leaves to produce the final `outputs`. | |
| """ | |
class NeuralDecisionTree(keras.Model):
    """A single differentiable decision tree with soft (probabilistic) routing.

    Learns two sets of weights: `pi`, the per-leaf class logits, and the dense
    routing layer `decision_fn`, whose sigmoid outputs are the routing
    probabilities.
    """

    def __init__(self, depth, num_features, used_features_rate, num_classes):
        """Create the feature mask, the leaf weights and the routing layer.

        Args:
            depth: number of tree levels; the tree has `2**depth` leaves.
            num_features: size of the encoded input feature vector.
            used_features_rate: fraction of the input features this tree uses.
            num_classes: number of output classes.
        """
        super().__init__()
        self.depth = depth
        self.num_leaves = 2**depth
        self.num_classes = num_classes

        # Sample, without replacement, the features this tree is allowed to
        # use, and keep the matching rows of the identity matrix as a mask.
        num_used_features = int(num_features * used_features_rate)
        sampled_feature_indices = np.random.choice(
            np.arange(num_features), num_used_features, replace=False
        )
        selection_rows = np.eye(num_features)[sampled_feature_indices]
        self.used_features_mask = ops.convert_to_tensor(
            selection_rows, dtype="float32"
        )

        # Class logits held at each leaf.
        self.pi = self.add_weight(
            shape=[self.num_leaves, self.num_classes],
            initializer="random_normal",
            dtype="float32",
            trainable=True,
        )

        # Stochastic routing layer: one sigmoid unit per column.
        self.decision_fn = layers.Dense(
            units=self.num_leaves, activation="sigmoid", name="decision"
        )

    def call(self, features):
        """Return class probabilities for a batch of encoded feature vectors.

        Args:
            features: tensor of shape [batch_size, num_features].

        Returns:
            Tensor of shape [batch_size, num_classes].
        """
        batch_size = ops.shape(features)[0]

        # Keep only the sampled features: [batch_size, num_used_features].
        selected = ops.matmul(features, ops.transpose(self.used_features_mask))

        # Routing probabilities paired with their complements:
        # [batch_size, num_leaves, 2].
        routing = ops.expand_dims(self.decision_fn(selected), axis=2)
        decisions = layers.concatenate([routing, 1 - routing], axis=2)

        # Probability of reaching each node of the current level, starting
        # from the root.
        mu = ops.ones([batch_size, 1, 1])

        # Walk the tree level by level (breadth-first). The nodes of `level`
        # occupy columns [2**level, 2**(level + 1)) of `decisions`; column 0
        # is never read.
        for level in range(self.depth):
            level_start = 2**level
            level_stop = 2 ** (level + 1)
            mu = ops.reshape(mu, [batch_size, -1, 1])  # [batch_size, 2**level, 1]
            mu = ops.tile(mu, (1, 1, 2))  # [batch_size, 2**level, 2]
            # Multiply by the routing probabilities of this level's nodes.
            mu = mu * decisions[:, level_start:level_stop, :]

        mu = ops.reshape(mu, [batch_size, self.num_leaves])  # [batch_size, num_leaves]
        leaf_class_probs = keras.activations.softmax(self.pi)  # [num_leaves, num_classes]
        return ops.matmul(mu, leaf_class_probs)  # [batch_size, num_classes]
| """ | |
| ## Deep Neural Decision Forest | |
| The neural decision forest model consists of a set of neural decision trees that are | |
| trained simultaneously. The output of the forest model is the average of the outputs of its trees. | |
| """ | |
class NeuralDecisionForest(keras.Model):
    """An ensemble of `NeuralDecisionTree` models whose outputs are averaged."""

    def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):
        """Build `num_trees` trees that share the same hyperparameters.

        Args:
            num_trees: number of trees in the ensemble; must be at least 1.
            depth: depth of every tree.
            num_features: size of the encoded input feature vector.
            used_features_rate: fraction of the features each tree uses.
            num_classes: number of output classes.

        Raises:
            ValueError: if `num_trees` is smaller than 1.
        """
        super().__init__()
        if num_trees < 1:
            # An empty ensemble would make `call` divide by zero.
            raise ValueError(f"num_trees must be at least 1, got {num_trees}")
        # Kept on the instance so `call` does not depend on a module-level
        # variable that happens to be named `num_classes`.
        self.num_classes = num_classes
        self.ensemble = []
        # Initialize the ensemble by adding NeuralDecisionTree instances.
        # Each tree will have its own randomly selected input features to use.
        for _ in range(num_trees):
            self.ensemble.append(
                NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
            )

    def call(self, inputs):
        """Return the average of the trees' outputs, [batch_size, num_classes]."""
        # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.
        batch_size = ops.shape(inputs)[0]
        outputs = ops.zeros([batch_size, self.num_classes])
        # Aggregate the outputs of trees in the ensemble.
        for tree in self.ensemble:
            outputs += tree(inputs)
        # Divide the outputs by the ensemble size to get the average.
        outputs /= len(self.ensemble)
        return outputs
| """ | |
| Finally, let's set up the code that will train and evaluate the model. | |
| """ | |
# Training hyperparameters read by `run_experiment`.
learning_rate = 1e-2
# NOTE(review): 265 may be a typo for 256 - confirm before changing.
batch_size = 265
num_epochs = 10
def run_experiment(model):
    """Compile `model`, train it on the training CSV and report test accuracy.

    Uses the module-level `learning_rate`, `batch_size` and `num_epochs`
    settings and the CSV files named by `train_data_file` / `test_data_file`.

    Args:
        model: the Keras model to train and evaluate.
    """
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )

    print("Start training the model...")
    training_batches = get_dataset_from_csv(
        train_data_file, shuffle=True, batch_size=batch_size
    )
    model.fit(training_batches, epochs=num_epochs)
    print("Model training finished")

    print("Evaluating the model on the test data...")
    evaluation_batches = get_dataset_from_csv(test_data_file, batch_size=batch_size)
    # `evaluate` returns the loss followed by the single compiled metric.
    _loss, accuracy = model.evaluate(evaluation_batches)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
| """ | |
| ## Experiment 1: train a decision tree model | |
| In this experiment, we train a single neural decision tree model | |
| where we use all input features. | |
| """ | |
# Settings for experiment 1. `num_trees` is not read by `create_tree_model`.
num_trees = 10
depth = 10
# A rate of 1.0 makes the tree use every input feature.
used_features_rate = 1.0
# One output class per target label.
num_classes = len(TARGET_LABELS)
def create_tree_model():
    """Build a Keras model made of the input encoding and a single tree.

    Reads the module-level `depth`, `used_features_rate` and `num_classes`
    settings.

    Returns:
        A `keras.Model` mapping the feature inputs to the tree's outputs.
    """
    inputs = create_model_inputs()
    normalized = layers.BatchNormalization()(encode_inputs(inputs))
    # The tree needs the width of the encoded feature vector.
    feature_width = normalized.shape[1]
    tree = NeuralDecisionTree(depth, feature_width, used_features_rate, num_classes)
    return keras.Model(inputs=inputs, outputs=tree(normalized))
# Build the single-tree model, then train and evaluate it.
tree_model = create_tree_model()
run_experiment(tree_model)
| """ | |
| ## Experiment 2: train a forest model | |
| In this experiment, we train a neural decision forest with `num_trees` trees | |
| where each tree uses a randomly selected 50% of the input features. You can control the number | |
| of features to be used in each tree by setting the `used_features_rate` variable. | |
| In addition, we set the depth to 5 instead of 10 compared to the previous experiment. | |
| """ | |
# Settings for experiment 2: more, shallower trees, each using half of the
# input features.
num_trees = 25
depth = 5
used_features_rate = 0.5
def create_forest_model():
    """Build a Keras model made of the input encoding and a decision forest.

    Reads the module-level `num_trees`, `depth`, `used_features_rate` and
    `num_classes` settings.

    Returns:
        A `keras.Model` mapping the feature inputs to the forest's outputs.
    """
    inputs = create_model_inputs()
    normalized = layers.BatchNormalization()(encode_inputs(inputs))
    # Every tree needs the width of the encoded feature vector.
    feature_width = normalized.shape[1]
    forest = NeuralDecisionForest(
        num_trees, depth, feature_width, used_features_rate, num_classes
    )
    return keras.Model(inputs=inputs, outputs=forest(normalized))
# Build the forest model, then train and evaluate it.
forest_model = create_forest_model()
run_experiment(forest_model)