"""Quantile binning with missing data.

2021-10-03

Uses NumPy and Numba for fast binning of numerical data into quantile bins.
Missing values (NaN) are supported.
"""
import numpy as np
from numba import njit
def _find_binning_thresholds(data, max_bins=256, subsample=int(2e5)):
    """Compute per-column bin thresholds from the finite values of ``data``.

    Each column is handled independently. Non-finite entries (NaN, +/-inf)
    are dropped before the thresholds are computed. When a column has at most
    ``max_bins`` distinct values, the thresholds are the midpoints between
    consecutive distinct values; otherwise they are the ``max_bins - 1``
    interior percentiles of the column, computed with midpoint interpolation.

    Parameters
    ----------
    data : 2-D array, indexed as ``data[:, column]``
        Values to derive the thresholds from.
    max_bins : int, default=256
        Maximum number of bins per column; must satisfy ``2 <= max_bins <= 256``.
    subsample : int, default=200000
        Accepted for interface compatibility; this implementation does not
        use it (all rows are always considered).

    Returns
    -------
    list of ndarray
        One sorted, duplicate-free float64 array of thresholds per column.

    Raises
    ------
    ValueError
        If ``max_bins`` is outside ``[2, 256]``.
    """
    if not (2 <= max_bins <= 256):
        raise ValueError(
            "max_bins={} should be no smaller than 2 "
            "and no larger than 256.".format(max_bins)
        )

    # Interior percentiles only: 0 and 100 would not separate any values.
    percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1]

    binning_thresholds = []
    for f_idx in range(data.shape[1]):
        col_data = np.ascontiguousarray(data[:, f_idx], dtype=np.float64)
        # Missing / infinite values must not influence the thresholds.
        col_data = col_data[np.isfinite(col_data)]
        distinct_values = np.unique(col_data)

        if len(distinct_values) <= max_bins:
            # Few distinct values: put one threshold between each neighbour pair.
            midpoints = (distinct_values[:-1] + distinct_values[1:]) * 0.5
        else:
            try:
                midpoints = np.percentile(col_data, percentiles, method="midpoint")
            except TypeError:
                # NumPy < 1.22 only knows the keyword under its old name.
                midpoints = np.percentile(
                    col_data, percentiles, interpolation="midpoint"
                )
            midpoints = midpoints.astype(np.float64)

        # Percentiles of skewed data can coincide; keep each threshold once.
        binning_thresholds.append(np.unique(midpoints))
    return binning_thresholds
@njit()
def _map_num_col_to_bins(data, binning_thresholds, binned):
    """Write the bin index of every value of one column into ``binned``.

    For each ``data[i]`` a binary search finds the smallest index ``k`` with
    ``data[i] <= binning_thresholds[k]``; if no threshold qualifies, ``k`` is
    ``binning_thresholds.shape[0]``. The result is stored in ``binned[i]``.
    NaN compares false against every threshold, so it ends up at
    ``binning_thresholds.shape[0]`` as well.

    Parameters
    ----------
    data : 1-D array
        Values of a single column.
    binning_thresholds : 1-D array
        Thresholds for that column; the binary search assumes ascending order.
    binned : 1-D array
        Output buffer, modified in place; indexed with the same ``i`` as
        ``data``.
    """
    for i in range(data.shape[0]):
        left, right = 0, binning_thresholds.shape[0]
        while left < right:
            middle = (right + left - 1) // 2
            if data[i] <= binning_thresholds[middle]:
                right = middle
            else:
                left = middle + 1
        binned[i] = left
def _map_to_bins(data, binning_thresholds, binned):
    """Bin numerical values to discrete integer-coded levels.

    Every column of ``data`` is binned with its own entry of
    ``binning_thresholds`` and the result is written into the matching column
    of ``binned`` (in place; nothing is returned).
    """
    n_columns = data.shape[1]
    for col in range(n_columns):
        column_values = data[:, col]
        column_thresholds = binning_thresholds[col]
        column_out = binned[:, col]
        _map_num_col_to_bins(column_values, column_thresholds, column_out)
def _assign_nan_to_bin(binned, X, actual_n_bins, assign_nan_to_unique_bin=False):
    """Overwrite the binned value of every position where ``X`` is NaN.

    Parameters
    ----------
    binned : 2-D array
        Binned values, modified in place. It must be able to hold NaN when
        ``assign_nan_to_unique_bin`` is false.
    X : 2-D array
        Original values; ``np.isnan(X)`` selects the positions to overwrite.
    actual_n_bins : sequence
        Per-column value written for NaN positions when
        ``assign_nan_to_unique_bin`` is true.
    assign_nan_to_unique_bin : bool, default=False
        If true, NaN positions in column ``i`` receive ``actual_n_bins[i]``;
        otherwise they receive NaN.

    Returns
    -------
    The same ``binned`` array that was passed in.
    """
    nan_mask = np.isnan(X)
    for i in range(X.shape[1]):
        fill_value = actual_n_bins[i] if assign_nan_to_unique_bin else np.nan
        binned[nan_mask[:, i], i] = fill_value
    return binned
class QuantileBinning:
    """Bin the columns of a 2-D numerical array using learned thresholds.

    ``fit`` learns one threshold array per column; ``transform`` replaces each
    value by the index of the bin it falls into. Positions that are NaN in
    the input become NaN in the output, or a dedicated extra bin index when
    ``assign_nan_to_unique_bin`` is true.
    """

    def __init__(self):
        # One threshold array per column; filled in by fit().
        self.bin_thresholds = []
        # Per-column number of bins (number of thresholds + 1); set by fit().
        self.n_bins = []

    def fit(self, X):
        """Learn the bin thresholds of every column of ``X``.

        Parameters
        ----------
        X : 2-D array
            Training values; non-finite entries are ignored when the
            thresholds are computed.

        Returns
        -------
        QuantileBinning
            The fitted instance itself, so calls can be chained.
        """
        self.bin_thresholds = _find_binning_thresholds(X)
        # k thresholds split a column into k + 1 bins.
        self.n_bins = np.array(
            [thresholds.shape[0] + 1 for thresholds in self.bin_thresholds],
            dtype=np.uint32,
        )
        return self

    def transform(self, X, assign_nan_to_unique_bin=False):
        """Map every value of ``X`` to its bin index.

        Parameters
        ----------
        X : 2-D array
            Values to bin; columns are matched by position with the
            thresholds learned in ``fit``.
        assign_nan_to_unique_bin : bool, default=False
            If true, NaN entries of column ``i`` are encoded as
            ``self.n_bins[i]``; otherwise they stay NaN in the output.

        Returns
        -------
        ndarray
            float32, Fortran-ordered array shaped like ``X`` holding the bin
            indices.
        """
        # float32 (not an integer dtype) so that NaN can be stored.
        binned = np.zeros_like(X, dtype=np.float32, order="F")
        _map_to_bins(X, self.bin_thresholds, binned)
        return _assign_nan_to_bin(binned, X, self.n_bins, assign_nan_to_unique_bin)