Ensemble decision trees in Numba
Representing ensembles of decision trees using numpy arrays with fast numba operations.
import math
import numpy as np
from numba import njit, prange
@njit
def take(X, inds):
"""Multidimensional indexing for numba"""
n = len(X)
y = np.zeros(n, dtype=X.dtype)
for i in range(n):
y[i] = X[i, inds[i]]
return y
@njit
def next_node(node_id, value, thr):
"""A vectorized operation to find the next node in a binary tree given a
value and threshold.
Args:
node_id: int or array of the current node, root node_id = 0
Return:
new node id, left node when value <= thr, right node when value > thr
"""
return (node_id << 1) + 1 + (1 * (thr < value))
@njit
def leaf(X, features, thresholds, reset_leaf_index=1):
"""Find the leaf node index along a decision path given a tree feature
indices, thresholds, and design matrix.
Args:
X: 2D design matrix of shape [nsamples, nfeatures]
features: feature indices as dtype np.int32 of shape [internal_nodes]
thresholds: split thresholds as dtype np.float64 of shape [internal_nodes]
reset_leaf_index: returns leaf index initialized from 1
"""
nsamples = len(X)
node_id = np.zeros(nsamples, dtype=np.int64)
n = len(features)
depth = int(math.log(n+1)/math.log(2))
internal_nodes = 2**(depth) - 1
for i in range(depth):
feature_ind = features[node_id]
value = take(X, feature_ind)
thr = thresholds[node_id]
node_id = next_node(node_id, value, thr)
if reset_leaf_index == 1:
node_id = node_id - internal_nodes
return node_id
@njit
def leaf_tokens(X, trees, nleaves_per_tree):
"""Tokenize a design matrix with leaf indices.
Args:
X: 2D design matrix of shape [nsamples, nfeatures]
trees: 3D matrix deifining an ensemble of decision trees of shape
[ntrees, internal_nodes, 2]
nleaves_per_tree: number of leaves in each decision tree, 2**depth
"""
nsamples = X.shape[0]
ntrees = trees.shape[0]
leaves = np.zeros((nsamples,ntrees), dtype=np.int64)
for i in prange(0,ntrees):
features = trees[i,:,0].astype(np.int64)
thresholds = trees[i,:,1]
leaves[:,i] = leaf(X, features, thresholds) + nleaves_per_tree * i
return leaves
@njit
def random_ensemble_decision_trees(ntrees, depth, nfeatures):
"""Generate a random ensemble of decision trees
Args:
ntrees: number of trees in ensemble
depth: height of each tree
nfeatures: number of features in design matrix
Returns:
trees: 3D matrix of shape [ntrees, internal_nodes, 2] where internal_nodes
is the number of non-leaf nodes (including root node) calculated
as 2^{depth} - 1 and the last dimension includes feature index and
splitting thresholds
"""
internal_nodes = 2**(depth) - 1
total_internal_nodes = ntrees * internal_nodes
trees = np.zeros((total_internal_nodes, 2)) # [features, thresholds]
trees[:,0] = np.random.randint(0, nfeatures, total_internal_nodes)
trees[:,1] = np.random.rand(total_internal_nodes)
trees = trees.reshape(ntrees, internal_nodes, -1) # [ntrees, internal_nodes, 2]
return trees
nsamples, nfeatures = 1000, 30
X = np.random.rand(nsamples,nfeatures)
ntrees = 1000
depth = 1
nleaves = 2**depth
total_nleaves = ntrees * nleaves
trees = random_ensemble_decision_trees(ntrees=ntrees, depth=depth, nfeatures=nfeatures)
leaves = leaf_tokens(X, trees, nleaves)