Ensemble decision trees in Numba
2021-09-21
Representing ensembles of decision trees using numpy arrays with fast numba operations.
import math
import numpy as np
from numba import njit, prange
@njit
def take(X, inds):
"""Multidimensional indexing for numba"""
= len(X)
n = np.zeros(n, dtype=X.dtype)
y for i in range(n):
= X[i, inds[i]]
y[i] return y
@njit
def next_node(node_id, value, thr):
"""A vectorized operation to find the next node in a binary tree given a
value and threshold.
Args:
node_id: int or array of the current node, root node_id = 0
Return:
new node id, left node when value <= thr, right node when value > thr
"""
return (node_id << 1) + 1 + (1 * (thr < value))
@njit
def leaf(X, features, thresholds, reset_leaf_index=1):
"""Find the leaf node index along a decision path given a tree feature
indices, thresholds, and design matrix.
Args:
X: 2D design matrix of shape [nsamples, nfeatures]
features: feature indices as dtype np.int32 of shape [internal_nodes]
thresholds: split thresholds as dtype np.float64 of shape [internal_nodes]
reset_leaf_index: returns leaf index initialized from 1
"""
= len(X)
nsamples = np.zeros(nsamples, dtype=np.int64)
node_id = len(features)
n = int(math.log(n+1)/math.log(2))
depth = 2**(depth) - 1
internal_nodes for i in range(depth):
= features[node_id]
feature_ind = take(X, feature_ind)
value = thresholds[node_id]
thr = next_node(node_id, value, thr)
node_id if reset_leaf_index == 1:
= node_id - internal_nodes
node_id return node_id
@njit
def leaf_tokens(X, trees, nleaves_per_tree):
"""Tokenize a design matrix with leaf indices.
Args:
X: 2D design matrix of shape [nsamples, nfeatures]
trees: 3D matrix deifining an ensemble of decision trees of shape
[ntrees, internal_nodes, 2]
nleaves_per_tree: number of leaves in each decision tree, 2**depth
"""
= X.shape[0]
nsamples = trees.shape[0]
ntrees = np.zeros((nsamples,ntrees), dtype=np.int64)
leaves for i in prange(0,ntrees):
= trees[i,:,0].astype(np.int64)
features = trees[i,:,1]
thresholds = leaf(X, features, thresholds) + nleaves_per_tree * i
leaves[:,i] return leaves
@njit
def random_ensemble_decision_trees(ntrees, depth, nfeatures):
"""Generate a random ensemble of decision trees
Args:
ntrees: number of trees in ensemble
depth: height of each tree
nfeatures: number of features in design matrix
Returns:
trees: 3D matrix of shape [ntrees, internal_nodes, 2] where internal_nodes
is the number of non-leaf nodes (including root node) calculated
as 2^{depth} - 1 and the last dimension includes feature index and
splitting thresholds
"""
= 2**(depth) - 1
internal_nodes = ntrees * internal_nodes
total_internal_nodes = np.zeros((total_internal_nodes, 2)) # [features, thresholds]
trees 0] = np.random.randint(0, nfeatures, total_internal_nodes)
trees[:,1] = np.random.rand(total_internal_nodes)
trees[:,= trees.reshape(ntrees, internal_nodes, -1) # [ntrees, internal_nodes, 2]
trees return trees
= 1000, 30
nsamples, nfeatures = np.random.rand(nsamples,nfeatures)
X
= 1000
ntrees = 1
depth = 2**depth
nleaves = ntrees * nleaves
total_nleaves = random_ensemble_decision_trees(ntrees=ntrees, depth=depth, nfeatures=nfeatures)
trees = leaf_tokens(X, trees, nleaves) leaves