Ensemble decision trees in Numba
Representing ensembles of decision trees using numpy arrays with fast numba operations.
Representing ensembles of decision trees using numpy arrays with fast numba operations.
import math
import numpy as np
from numba import njit, prange
@njit
def take(X, inds):
"""Multidimensional indexing for numba"""
n = len(X)
y = np.zeros(n, dtype=X.dtype)
for i in range(n):
y[i] = X[i, inds[i]]
return y
@njit
def next_node(node_id, value, thr):
"""A vectorized operation to find the next node in a binary tree given a
value and threshold.
Args:
node_id: int or array of the current node, root node_id = 0
Return:
new node id, left node when value <= thr, right node when value > thr
"""
return (node_id << 1) + 1 + (1 * (thr < value))
@njit
def leaf(X, features, thresholds, reset_leaf_index=1):
"""Find the leaf node index along a decision path given a tree feature
indices, thresholds, and design matrix.
Args:
X: 2D design matrix of shape [nsamples, nfeatures]
features: feature indices as dtype np.int32 of shape [internal_nodes]
thresholds: split thresholds as dtype np.float64 of shape [internal_nodes]
reset_leaf_index: returns leaf index initialized from 1
"""
nsamples = len(X)
node_id = np.zeros(nsamples, dtype=np.int64)
n = len(features)
depth = int(math.log(n+1)/math.log(2))
internal_nodes = 2**(depth) - 1
for i in range(depth):
feature_ind = features[node_id]
value = take(X, feature_ind)
thr = thresholds[node_id]
node_id = next_node(node_id, value, thr)
if reset_leaf_index == 1:
node_id = node_id - internal_nodes
return node_id
@njit
def leaf_tokens(X, trees, nleaves_per_tree):
"""Tokenize a design matrix with leaf indices.
Args:
X: 2D design matrix of shape [nsamples, nfeatures]
trees: 3D matrix deifining an ensemble of decision trees of shape
[ntrees, internal_nodes, 2]
nleaves_per_tree: number of leaves in each decision tree, 2**depth
"""
nsamples = X.shape[0]
ntrees = trees.shape[0]
leaves = np.zeros((nsamples,ntrees), dtype=np.int64)
for i in prange(0,ntrees):
features = trees[i,:,0].astype(np.int64)
thresholds = trees[i,:,1]
leaves[:,i] = leaf(X, features, thresholds) + nleaves_per_tree * i
return leaves
@njit
def random_ensemble_decision_trees(ntrees, depth, nfeatures):
"""Generate a random ensemble of decision trees
Args:
ntrees: number of trees in ensemble
depth: height of each tree
nfeatures: number of features in design matrix
Returns:
trees: 3D matrix of shape [ntrees, internal_nodes, 2] where internal_nodes
is the number of non-leaf nodes (including root node) calculated
as 2^{depth} - 1 and the last dimension includes feature index and
splitting thresholds
"""
internal_nodes = 2**(depth) - 1
total_internal_nodes = ntrees * internal_nodes
trees = np.zeros((total_internal_nodes, 2)) # [features, thresholds]
trees[:,0] = np.random.randint(0, nfeatures, total_internal_nodes)
trees[:,1] = np.random.rand(total_internal_nodes)
trees = trees.reshape(ntrees, internal_nodes, -1) # [ntrees, internal_nodes, 2]
return trees
nsamples, nfeatures = 1000, 30
X = np.random.rand(nsamples,nfeatures)
ntrees = 1000
depth = 1
nleaves = 2**depth
total_nleaves = ntrees * nleaves
trees = random_ensemble_decision_trees(ntrees=ntrees, depth=depth, nfeatures=nfeatures)
leaves = leaf_tokens(X, trees, nleaves)