public final class DHistogram
extends water.Iced
A DHistogram
bins every value added to it, and computes the
vec min and max (for use in the next split), and response mean and variance
for each bin. DHistograms
are initialized with a min, max and
number-of-elements to be added (all of which are generally available from
a Vec). Bins run from min to max in uniform sizes. If the DHistogram
can determine that fewer bins are needed (e.g. boolean columns
run from 0 to 1, but only ever take on 2 values, so only 2 bins are
needed), then fewer bins are used.
DHistograms
are shared per-node, and atomically updated. There's
an add
call to help cross-node reductions. The data is stored in
primitive arrays, so it can be sent over the wire.
If we are successively splitting rows (e.g. in a decision tree), then a
fresh DHistogram
for each split will dynamically re-bin the data.
Each successive split will logarithmically divide the data. At the first
split, outliers will end up in their own bins - but perhaps some central
bins may be very full. At the next split(s) - if they happen at all -
the full bins will get split, and again until (with a log number of splits)
each bin holds roughly the same amount of data. This 'UniformAdaptive' binning
resolves a lot of problems with picking the proper bin count or limits -
generally a few more tree levels will equal any fancy but fixed-size binning strategy.
Support for histogram split points based on quantiles (or random points) is
available as well, via _histoType.
Modifier and Type | Class and Description |
---|---|
static class |
DHistogram.NASplitDir
Split direction for missing values.
|
Modifier and Type | Field and Description |
---|---|
water.Key |
_globalQuantilesKey |
boolean |
_hasQuantiles |
SharedTreeModel.SharedTreeParameters.HistogramType |
_histoType |
byte |
_isInt |
double |
_maxEx |
protected double |
_maxIn |
double |
_min |
protected double |
_min2 |
double |
_minSplitImprovement |
java.lang.String |
_name |
char |
_nbin |
long |
_seed |
double[] |
_splitPts |
double |
_step |
protected double[] |
_vals |
Constructor and Description |
---|
DHistogram(java.lang.String name,
int nbins,
int nbins_cats,
byte isInt,
double min,
double maxEx,
double minSplitImprovement,
SharedTreeModel.SharedTreeParameters.HistogramType histogramType,
long seed,
water.Key globalQuantilesKey) |
Modifier and Type | Method and Description |
---|---|
static int[] |
activeColumns(DHistogram[] hist) |
void |
add(DHistogram dsh) |
void |
addNasAtomic(double y,
double wy,
double wyy) |
void |
addNasPlain(double... ds) |
void |
addWAtomic(int i,
double wDelta) |
int |
bin(double col_data) |
double |
binAt(int b) |
double |
bins(int b) |
double |
find_maxEx() |
static double |
find_maxEx(double maxIn,
int isInt) |
double |
find_maxIn() |
double |
find_min() |
void |
incr0(int b,
double y,
double w) |
void |
incr1(int b,
double y,
double yy) |
void |
init() |
void |
init(double[] vals) |
static DHistogram[] |
initialHist(water.fvec.Frame fr,
int ncols,
int nbins,
DHistogram[] hs,
long seed,
SharedTreeModel.SharedTreeParameters parms,
water.Key[] globalQuantilesKey) |
static DHistogram |
make(java.lang.String name,
int nbins,
byte isInt,
double min,
double maxEx,
long seed,
SharedTreeModel.SharedTreeParameters parms,
water.Key globalQuantilesKey) |
int |
nbins() |
void |
reducePrecision()
Cast bin values (except for sums of weights and NA-bucket counters) to floats to drop the least significant bits.
|
void |
setMaxIn(double max) |
void |
setMin(double min) |
java.lang.String |
toString() |
void |
updateHisto(double[] ws,
double[] cs,
double[] ys,
int[] rows,
int hi,
int lo)
Update counts in appropriate bins.
|
void |
updateSharedHistosAndReset(hex.tree.ScoreBuildHistogram.LocalHisto lh,
double[] ws,
double[] cs,
double[] ys,
int[] rows,
int hi,
int lo) |
double |
var(int b)
compute the sample variance within a given bin
|
double |
w(int i) |
double |
wNA() |
double |
wY(int i) |
double |
wYNA() |
double |
wYY(int i) |
double |
wYYNA() |
public final transient java.lang.String _name
public final double _minSplitImprovement
public final byte _isInt
public char _nbin
public double _step
public final double _min
public final double _maxEx
protected double[] _vals
protected double _min2
protected double _maxIn
public SharedTreeModel.SharedTreeParameters.HistogramType _histoType
public transient double[] _splitPts
public final long _seed
public transient boolean _hasQuantiles
public water.Key _globalQuantilesKey
public DHistogram(java.lang.String name, int nbins, int nbins_cats, byte isInt, double min, double maxEx, double minSplitImprovement, SharedTreeModel.SharedTreeParameters.HistogramType histogramType, long seed, water.Key globalQuantilesKey)
public double w(int i)
public double wY(int i)
public double wYY(int i)
public void addWAtomic(int i, double wDelta)
public void addNasAtomic(double y, double wy, double wyy)
public void addNasPlain(double... ds)
public double wNA()
public double wYNA()
public double wYYNA()
public static int[] activeColumns(DHistogram[] hist)
public void setMin(double min)
public void setMaxIn(double max)
public int bin(double col_data)
public double binAt(int b)
public int nbins()
public double bins(int b)
public void init()
public void init(double[] vals)
public void add(DHistogram dsh)
public double find_min()
public double find_maxIn()
public double find_maxEx()
public static double find_maxEx(double maxIn, int isInt)
public static DHistogram[] initialHist(water.fvec.Frame fr, int ncols, int nbins, DHistogram[] hs, long seed, SharedTreeModel.SharedTreeParameters parms, water.Key[] globalQuantilesKey)
public static DHistogram make(java.lang.String name, int nbins, byte isInt, double min, double maxEx, long seed, SharedTreeModel.SharedTreeParameters parms, water.Key globalQuantilesKey)
public java.lang.String toString()
Overrides: toString
in class java.lang.Object
public double var(int b)
b - bin id
public void incr0(int b, double y, double w)
public void incr1(int b, double y, double yy)
public void updateHisto(double[] ws, double[] cs, double[] ys, int[] rows, int hi, int lo)
ws - observation weights
cs - column data
ys - response
rows - rows sorted by leaf assignment
hi - upper bound on index into rows array to be processed by this call (exclusive)
lo - lower bound on index into rows array to be processed by this call (inclusive)
public void reducePrecision()
public void updateSharedHistosAndReset(hex.tree.ScoreBuildHistogram.LocalHisto lh, double[] ws, double[] cs, double[] ys, int[] rows, int hi, int lo)