public final class DHistogram
extends water.Iced
A DHistogram bins every value added to it, and computes the
vec min and max (for use in the next split), and response mean and variance
for each bin. DHistograms are initialized with a min, max and
number-of-elements to be added (all of which are generally available from
a Vec). Bins run from min to max in uniform sizes. If the DHistogram can determine that fewer bins are needed (e.g. boolean columns
run from 0 to 1, but only ever take on 2 values, so only 2 bins are
needed), then fewer bins are used.
DHistograms are shared per-node, and atomically updated. There's
an add call to help cross-node reductions. The data is stored in
primitive arrays, so it can be sent over the wire.
If we are successively splitting rows (e.g. in a decision tree), then a
fresh DHistogram for each split will dynamically re-bin the data.
Each successive split will logarithmically divide the data. At the first
split, outliers will end up in their own bins - but perhaps some central
bins may be very full. At the next split(s) - if they happen at all -
the full bins will get split, and again until (with a log number of splits)
each bin holds roughly the same amount of data. This 'UniformAdaptive' binning
resolves a lot of problems with picking the proper bin count or limits -
generally a few more tree levels will equal any fancy but fixed-size binning strategy.
Support for histogram split points based on quantiles (or random points) is
available as well, via _histoType.
| Modifier and Type | Class and Description |
|---|---|
static class |
DHistogram.NASplitDir
Split direction for missing values.
|
| Modifier and Type | Field and Description |
|---|---|
water.Key |
_globalQuantilesKey |
boolean |
_hasQuantiles |
SharedTreeModel.SharedTreeParameters.HistogramType |
_histoType |
byte |
_isInt |
double |
_maxEx |
protected double |
_maxIn |
double |
_min |
protected double |
_min2 |
double |
_minSplitImprovement |
java.lang.String |
_name |
char |
_nbin |
long |
_seed |
double[] |
_splitPts |
double |
_step |
double[] |
_w |
| Constructor and Description |
|---|
DHistogram(java.lang.String name,
int nbins,
int nbins_cats,
byte isInt,
double min,
double maxEx,
double minSplitImprovement,
SharedTreeModel.SharedTreeParameters.HistogramType histogramType,
long seed,
water.Key globalQuantilesKey) |
| Modifier and Type | Method and Description |
|---|---|
static int[] |
activeColumns(DHistogram[] hist) |
void |
add(DHistogram dsh) |
void |
add0(DHistogram dsh) |
int |
bin(double col_data) |
double |
binAt(int b) |
double |
bins(int b) |
double |
find_maxEx() |
static double |
find_maxEx(double maxIn,
int isInt) |
double |
find_maxIn() |
double |
find_min() |
DTree.Split |
findBestSplitPoint(int col,
double min_rows) |
void |
incr0(int b,
double y,
double w) |
void |
incr1(int b,
double y,
double yy) |
void |
init() |
static DHistogram[] |
initialHist(water.fvec.Frame fr,
int ncols,
int nbins,
DHistogram[] hs,
long seed,
SharedTreeModel.SharedTreeParameters parms,
water.Key[] globalQuantilesKey) |
static DHistogram |
make(java.lang.String name,
int nbins,
byte isInt,
double min,
double maxEx,
long seed,
SharedTreeModel.SharedTreeParameters parms,
water.Key globalQuantilesKey) |
int |
nbins() |
void |
setMaxIn(double max) |
void |
setMin(double min) |
java.lang.String |
toString() |
void |
updateSharedHistosAndReset(hex.tree.ScoreBuildHistogram.LocalHisto lh,
double[] ws,
double[] cs,
double[] ys,
int[] rows,
int hi,
int lo) |
double |
var(int b)
Computes the sample variance within a given bin.
|
public final transient java.lang.String _name
public final double _minSplitImprovement
public final byte _isInt
public char _nbin
public double _step
public final double _min
public final double _maxEx
public double[] _w
protected double _min2
protected double _maxIn
public SharedTreeModel.SharedTreeParameters.HistogramType _histoType
public transient double[] _splitPts
public final long _seed
public transient boolean _hasQuantiles
public water.Key _globalQuantilesKey
public DHistogram(java.lang.String name,
int nbins,
int nbins_cats,
byte isInt,
double min,
double maxEx,
double minSplitImprovement,
SharedTreeModel.SharedTreeParameters.HistogramType histogramType,
long seed,
water.Key globalQuantilesKey)
public static int[] activeColumns(DHistogram[] hist)
public void setMin(double min)
public void setMaxIn(double max)
public int bin(double col_data)
public double binAt(int b)
public int nbins()
public double bins(int b)
public void init()
public void add(DHistogram dsh)
public double find_min()
public double find_maxIn()
public double find_maxEx()
public static double find_maxEx(double maxIn,
int isInt)
public static DHistogram[] initialHist(water.fvec.Frame fr, int ncols, int nbins, DHistogram[] hs, long seed, SharedTreeModel.SharedTreeParameters parms, water.Key[] globalQuantilesKey)
public static DHistogram make(java.lang.String name, int nbins, byte isInt, double min, double maxEx, long seed, SharedTreeModel.SharedTreeParameters parms, water.Key globalQuantilesKey)
public java.lang.String toString()
Overrides: toString in class java.lang.Object
public double var(int b)
Parameters: b - bin id
public void incr0(int b,
double y,
double w)
public void incr1(int b,
double y,
double yy)
public void add0(DHistogram dsh)
public DTree.Split findBestSplitPoint(int col, double min_rows)
public void updateSharedHistosAndReset(hex.tree.ScoreBuildHistogram.LocalHisto lh,
double[] ws,
double[] cs,
double[] ys,
int[] rows,
int hi,
int lo)