public final class DHistogram
extends water.Iced
A DHistogram bins every value added to it, and computes the
vec min and max (for use in the next split), and response mean and variance
for each bin. DHistograms are initialized with a min, max and
number-of-elements to be added (all of which are generally available from
a Vec). Bins run from min to max in uniform sizes. If the DHistogram can determine that fewer bins are needed (e.g. boolean columns
run from 0 to 1, but only ever take on 2 values, so only 2 bins are
needed), then fewer bins are used.
DHistograms are shared per-node, and atomically updated. There's
an add call to help cross-node reductions. The data is stored in
primitive arrays, so it can be sent over the wire.
If we are successively splitting rows (e.g. in a decision tree), then a
fresh DHistogram for each split will dynamically re-bin the data.
Each successive split will logarithmically divide the data. At the first
split, outliers will end up in their own bins - but perhaps some central
bins may be very full. At the next split(s) - if they happen at all -
the full bins will get split, and again until (with a log number of splits)
each bin holds roughly the same amount of data. This 'UniformAdaptive' binning
resolves a lot of problems with picking the proper bin count or limits -
generally a few more tree levels will equal any fancy but fixed-size binning strategy.
Support for histogram split points based on quantiles (or random points) is
available as well, via _histoType.
| Modifier and Type | Class and Description |
|---|---|
static class |
DHistogram.NASplitDir
Split direction for missing values.
|
| Modifier and Type | Field and Description |
|---|---|
water.Key |
_globalQuantilesKey |
boolean |
_hasQuantiles |
SharedTreeModel.SharedTreeParameters.HistogramType |
_histoType |
byte |
_isInt |
double |
_maxEx |
protected double |
_maxIn |
double |
_min |
protected double |
_min2 |
double |
_minSplitImprovement |
java.lang.String |
_name |
char |
_nbin |
double |
_pred1 |
double |
_pred2 |
long |
_seed |
double[] |
_splitPts |
double |
_step |
protected double[] |
_vals |
protected int |
_vals_dim |
| Constructor and Description |
|---|
DHistogram(java.lang.String name,
int nbins,
int nbins_cats,
byte isInt,
double min,
double maxEx,
double minSplitImprovement,
SharedTreeModel.SharedTreeParameters.HistogramType histogramType,
long seed,
water.Key globalQuantilesKey,
Constraints cs) |
| Modifier and Type | Method and Description |
|---|---|
static int[] |
activeColumns(DHistogram[] hist) |
void |
add(DHistogram dsh) |
void |
addNasAtomic(double y,
double wy,
double wyy) |
void |
addWAtomic(int i,
double wDelta) |
int |
bin(double col_data) |
double |
binAt(int b) |
double |
bins(int b) |
double |
denNA() |
double |
find_maxEx() |
static double |
find_maxEx(double maxIn,
int isInt) |
double |
find_maxIn() |
double |
find_min() |
void |
incr0(int b,
double y,
double w) |
void |
init() |
void |
init(double[] vals) |
static DHistogram[] |
initialHist(water.fvec.Frame fr,
int ncols,
int nbins,
DHistogram[] hs,
long seed,
SharedTreeModel.SharedTreeParameters parms,
water.Key[] globalQuantilesKey,
Constraints cs) |
static DHistogram |
make(java.lang.String name,
int nbins,
byte isInt,
double min,
double maxEx,
long seed,
SharedTreeModel.SharedTreeParameters parms,
water.Key globalQuantilesKey,
Constraints cs) |
int |
nbins() |
void |
reducePrecision()
Cast bin values (except for sums of weights and NA-bucket counters) to floats to drop least significant bits.
|
double |
seP1NA()
Squared Error for NA bucket and prediction value _pred1
|
double |
seP2NA()
Squared Error for NA bucket and prediction value _pred2
|
void |
setMaxIn(double max) |
void |
setMin(double min) |
java.lang.String |
toString() |
void |
updateSharedHistosAndReset(hex.tree.ScoreBuildHistogram.LocalHisto lh,
double[] ws,
double[] cs,
double[] ys,
int[] rows,
int hi,
int lo) |
double |
var(int b)
compute the sample variance within a given bin
|
double |
w(int i) |
double |
wNA() |
double |
wY(int i) |
double |
wYNA() |
double |
wYY(int i) |
double |
wYYNA() |
public final transient java.lang.String _name
public final double _minSplitImprovement
public final byte _isInt
public char _nbin
public double _step
public final double _min
public final double _maxEx
public final double _pred1
public final double _pred2
protected double[] _vals
protected final int _vals_dim
protected double _min2
protected double _maxIn
public SharedTreeModel.SharedTreeParameters.HistogramType _histoType
public transient double[] _splitPts
public final long _seed
public transient boolean _hasQuantiles
public water.Key _globalQuantilesKey
public DHistogram(java.lang.String name,
int nbins,
int nbins_cats,
byte isInt,
double min,
double maxEx,
double minSplitImprovement,
SharedTreeModel.SharedTreeParameters.HistogramType histogramType,
long seed,
water.Key globalQuantilesKey,
Constraints cs)
public double w(int i)
public double wY(int i)
public double wYY(int i)
public void addWAtomic(int i,
double wDelta)
public void addNasAtomic(double y,
double wy,
double wyy)
public double wNA()
public double wYNA()
public double wYYNA()
public double seP1NA()
public double seP2NA()
public double denNA()
public static int[] activeColumns(DHistogram[] hist)
public void setMin(double min)
public void setMaxIn(double max)
public int bin(double col_data)
public double binAt(int b)
public int nbins()
public double bins(int b)
public void init()
public void init(double[] vals)
public void add(DHistogram dsh)
public double find_min()
public double find_maxIn()
public double find_maxEx()
public static double find_maxEx(double maxIn,
int isInt)
public static DHistogram[] initialHist(water.fvec.Frame fr, int ncols, int nbins, DHistogram[] hs, long seed, SharedTreeModel.SharedTreeParameters parms, water.Key[] globalQuantilesKey, Constraints cs)
public static DHistogram make(java.lang.String name, int nbins, byte isInt, double min, double maxEx, long seed, SharedTreeModel.SharedTreeParameters parms, water.Key globalQuantilesKey, Constraints cs)
public java.lang.String toString()
Overrides: toString in class java.lang.Object
public double var(int b)
Parameters: b - bin id
public void incr0(int b,
double y,
double w)
public void reducePrecision()
public void updateSharedHistosAndReset(hex.tree.ScoreBuildHistogram.LocalHisto lh,
double[] ws,
double[] cs,
double[] ys,
int[] rows,
int hi,
int lo)