Asif Rahman

September 21, 2021

Ensemble decision trees in Numba

Representing ensembles of decision trees using numpy arrays with fast numba operations.

import math
import numpy as np
from numba import njit, prange


@njit
def take(X, inds):
    """Multidimensional indexing for numba"""
    n = len(X)
    y = np.zeros(n, dtype=X.dtype)
    for i in range(n):
        y[i] = X[i, inds[i]]
    return y


@njit
def next_node(node_id, value, thr):
    """A vectorized operation to find the next node in a binary tree given a 
    value and threshold.
    Args:
        node_id: int or array of the current node, root node_id = 0
    Return:
        new node id, left node when value <= thr, right node when value > thr
    """
    return (node_id << 1) + 1 + (1 * (thr < value))


@njit
def leaf(X, features, thresholds, reset_leaf_index=1):
    """Find the leaf node index along a decision path given a tree feature 
    indices, thresholds, and design matrix.
    Args:
        X: 2D design matrix of shape [nsamples, nfeatures]
        features: feature indices as dtype np.int32 of shape [internal_nodes]
        thresholds: split thresholds as dtype np.float64 of shape [internal_nodes]
        reset_leaf_index: returns leaf index initialized from 1
    """
    nsamples = len(X)
    node_id = np.zeros(nsamples, dtype=np.int64)
    n = len(features)
    depth = int(math.log(n+1)/math.log(2))
    internal_nodes = 2**(depth) - 1
    for i in range(depth):
        feature_ind = features[node_id]
        value = take(X, feature_ind)
        thr = thresholds[node_id]
        node_id = next_node(node_id, value, thr)
    if reset_leaf_index == 1:
        node_id = node_id - internal_nodes
    return node_id


@njit
def leaf_tokens(X, trees, nleaves_per_tree):
    """Tokenize a design matrix with leaf indices.
    Args:
        X: 2D design matrix of shape [nsamples, nfeatures]
        trees: 3D matrix deifining an ensemble of decision trees of shape 
            [ntrees, internal_nodes, 2]
        nleaves_per_tree: number of leaves in each decision tree, 2**depth
    """
    nsamples = X.shape[0]
    ntrees = trees.shape[0]
    leaves = np.zeros((nsamples,ntrees), dtype=np.int64)
    for i in prange(0,ntrees):
        features = trees[i,:,0].astype(np.int64)
        thresholds = trees[i,:,1]
        leaves[:,i] = leaf(X, features, thresholds) + nleaves_per_tree * i
    return leaves


@njit
def random_ensemble_decision_trees(ntrees, depth, nfeatures):
    """Generate a random ensemble of decision trees
    Args:
        ntrees: number of trees in ensemble
        depth: height of each tree
        nfeatures: number of features in design matrix
    Returns:
        trees: 3D matrix of shape [ntrees, internal_nodes, 2] where internal_nodes
            is the number of non-leaf nodes (including root node) calculated
            as 2^{depth} - 1 and the last dimension includes feature index and 
            splitting thresholds
    """
    internal_nodes = 2**(depth) - 1
    total_internal_nodes = ntrees * internal_nodes
    trees = np.zeros((total_internal_nodes, 2)) # [features, thresholds]
    trees[:,0] = np.random.randint(0, nfeatures, total_internal_nodes)
    trees[:,1] = np.random.rand(total_internal_nodes)
    trees = trees.reshape(ntrees, internal_nodes, -1)  # [ntrees, internal_nodes, 2]
    return trees


nsamples, nfeatures = 1000, 30
X = np.random.rand(nsamples,nfeatures)

ntrees = 1000
depth = 1
nleaves = 2**depth
total_nleaves = ntrees * nleaves
trees = random_ensemble_decision_trees(ntrees=ntrees, depth=depth, nfeatures=nfeatures)
leaves = leaf_tokens(X, trees, nleaves)