public class ModelSelectionUtils
extends java.lang.Object
| Modifier and Type | Class and Description |
|---|---|
static class |
ModelSelectionUtils.SweepVector
Stores information on sweeping actions that are to be performed on new rows/columns added to the CPM due to the
addition of new predictors.
|
| Constructor and Description |
|---|
ModelSelectionUtils() |
| Modifier and Type | Method and Description |
|---|---|
static double[][] |
addNewPred2CPM(double[][] allCPM,
double[][] currentCPM,
int[] subsetPredIndex,
int[][] pred2CPMIndices,
boolean hasIntercept)
Given the current CPM which has been swept already, we need to add the latest predictor to the current CPM that has
not been swept.
|
static void |
applySweepVectors2NewPred(ModelSelectionUtils.SweepVector[][] sweepVec,
double[][] subsetCPM,
int numNewRows,
int[] sweepMat)
This method will sweep the rows/columns added to the CPM due to the addition of the new predictor using sweep
vector arrays.
|
static GLM[] |
buildGLMBuilders(GLMModel.GLMParameters[] trainingParams) |
static double |
calR2Scale(water.fvec.Frame train,
java.lang.String resp) |
static double[][] |
createCrossProductMatrix(water.Key jobKey,
DataInfo dinfo) |
static java.util.List<java.lang.Integer> |
extractCPMIndexFromPred(double[][] allCPM,
int[][] pred2CPMIndices,
int[] newPredList,
boolean hasIntercept) |
static java.lang.String[] |
extractPredictorNames(hex.Model.Parameters parms,
DataInfo dinfo,
java.lang.String foldColumn) |
static double[][] |
extractPredSubsetsCPM(double[][] allCPM,
int[] predIndices,
int[][] pred2CPMIndices,
boolean hasIntercept)
Given a predictor subset and the complete CPM, we extract the CPM associated with the predictors
specified in the predictor subset (predIndices).
|
static int[] |
extractSweepIndices(java.util.List<java.lang.Integer> currSubsetIndices,
int predPos,
int predRemoved,
int[][] predInd2CPMIndices,
boolean hasIntercept)
Given predRemoved (the predictor that is to be removed and replaced in the forward step), this method will
calculate the locations of the CPM rows/columns associated with it.
|
static java.util.List<java.lang.String> |
extraModelColumnNames(java.util.List<java.lang.String> coefNames,
GLMModel bestModel) |
static GLMModel |
findBestModel(GLM[] glmResults)
Given GLM run results of a fixed number of predictors, find the model with the best R2 value.
|
static hex.modelselection.ModelSelectionUtils.PredNameMinZVal |
findCatMinZVal(GLMModel model,
java.util.List<java.lang.Double> zValList)
This method extracts the categorical coefficient z-value by using the following method:
1.
|
static int |
findMinZValue(GLMModel model,
java.util.List<java.lang.String> numPredNames,
java.util.List<java.lang.String> catPredNames,
java.util.List<java.lang.String> predNames) |
static hex.modelselection.ModelSelectionUtils.PredNameMinZVal |
findNumMinZVal(java.util.List<java.lang.String> numPredNames,
java.util.List<java.lang.Double> zValList,
java.util.List<java.lang.String> coeffNames) |
static void |
genBestSweepVector(ModelSelection.SweepModel bestModel,
double[][] cpm,
int[][] pred2CPMIndices,
boolean hasIntercept)
Given the predictor subset in bestModel, this method will perform sweeping on the predictor subset, generate
new sweep vector arrays and store the new swept CPM back to bestModel.
|
static double[] |
generateAllErrorVariances(double[][] allCPM,
ModelSelectionUtils.SweepVector[][] sweepVec,
double[][] prevCPM,
java.util.List<java.lang.Integer> currSubsetIndices,
java.util.List<java.lang.Integer> validSubsets,
java.util.Set<java.util.BitSet> usedCombo,
java.util.BitSet tempIndices,
int[][] pred2CPMIndices,
boolean hasIntercept,
int predPos,
int removedPred,
int[] sweepIndices)
Given the prevCPM which contains the CPM generated from the last forward step search, we will add a new predictor
from the predictors in validSubsets and calculate its error variances defined in section IV of doc.
|
static GLMModel.GLMParameters[] |
generateGLMParameters(water.fvec.Frame[] trainingFrames,
ModelSelectionModel.ModelSelectionParameters parms,
int nfolds,
java.lang.String foldColumn,
hex.Model.Parameters.FoldAssignmentScheme foldAssignment) |
static water.fvec.Frame[] |
generateMaxRTrainingFrames(ModelSelectionModel.ModelSelectionParameters parms,
java.lang.String[] predictorNames,
java.lang.String foldColumn,
java.util.List<java.lang.Integer> currSubsetIndices,
int newPredPos,
java.util.List<java.lang.Integer> validSubsets,
java.util.Set<java.util.BitSet> usedCombo)
double
|
static water.fvec.Frame |
generateOneFrame(int[] predIndices,
hex.Model.Parameters parms,
java.lang.String[] predNames,
java.lang.String foldColumn)
Given a predictor indices set, this function will generate a training frame containing the predictors with
indices in predIndices.
|
static water.fvec.Frame[] |
generateTrainingFrames(ModelSelectionModel.ModelSelectionParameters parms,
int predNum,
java.lang.String[] predNames,
int numModels,
java.lang.String foldColumn) |
static void |
genMSE1stPred(int[][] pred2CPMIndices,
double[][] allCPM,
int[] allPreds,
double[] subsetMSE,
jsr166y.RecursiveAction[] resA,
int resCount,
boolean hasIntercept) |
static void |
genMSE4MorePreds(int[][] pred2CPMIndices,
double[][] allCPM,
ModelSelectionUtils.SweepVector[][] sweepVec,
int[] allPreds,
double[][] prevCPM,
double[] subsetMSE,
jsr166y.RecursiveAction[] resA,
int resCount,
boolean hasIntercept,
int predPos,
int removedPred,
int[] sweepIndices) |
static ModelSelectionUtils.SweepVector[] |
genNewSV(ModelSelectionUtils.SweepVector[] oldSV,
double[][] subsetCPM,
int newPredLen,
int sweepInd)
Given the new subsetCPM with the added predictor, we need to go back to all sweep vectors, increase their size to
include the newly added predictor and add the correct contents.
|
static java.lang.String |
joinDouble(double[] val) |
static ModelSelectionUtils.SweepVector[][] |
mapBasicVector2Multiple(ModelSelectionUtils.SweepVector[][] sweepVec,
int newPredCPMLen)
When multiple rows/columns are added to the CPM due to the new predictor being categorical, we need to map the
old sweep vector arrays to new bigger sweep vector arrays.
|
static int[][] |
mapPredIndex2CPMIndices(DataInfo dinfo,
int predLength) |
static ModelSelectionUtils.SweepVector[][] |
mergeSV(ModelSelectionUtils.SweepVector[][] sweepVector,
ModelSelectionUtils.SweepVector[][] removedPredSV) |
static void |
oneSweepWSweepVector(ModelSelectionUtils.SweepVector[] sweepVec,
double[][] subsetCPM,
int sweepIndex,
int colRowsAdded)
This method performs just one sweep of the sweeping action described in Step 3 of section V.II.IV of doc.
|
static void |
performOneSweep(double[][] subsetCPM,
ModelSelectionUtils.SweepVector[] sweepVec,
int sweepIndex,
boolean genSweepVector)
Perform one sweep according to section II of doc and generate sweep vector according to section V.II of doc.
|
static void |
process(hex.modelselection.ModelSelectionUtils.SweepElement currEle,
java.util.List<hex.modelselection.ModelSelectionUtils.SweepElement> tempList)
This method will generate all the elements that are needed to perform sweeping on the currEle.
|
static void |
removeTrainingFrames(water.fvec.Frame[] trainingFrames) |
static double[][] |
replaceCPMwNewPred(double[][] cpm,
double[][] allCPM,
int[][] pred2CPM,
int[] oldSweepIndices,
int[] newSweepIndices,
int[] predSubset,
boolean hasIntercept) |
static void |
replaceSweepVectors(ModelSelectionUtils.SweepVector[][] origSV,
ModelSelectionUtils.SweepVector[][] newSV,
int startIndex) |
static void |
setBitSet(java.util.BitSet predBitSet,
int[] currIndices) |
static void |
setParamField(hex.Model.Parameters params,
GLMModel.GLMParameters glmParam,
boolean superClassParams,
java.lang.reflect.Field[] paramFields,
java.util.List<java.lang.String> excludeList) |
static double[][] |
shrinkDoubleArray(double[][] array,
int numModels) |
static water.Key[] |
shrinkKeyArray(water.Key[] array,
int numModels) |
static java.lang.String[][] |
shrinkStringArray(java.lang.String[][] array,
int numModels) |
static ModelSelectionUtils.SweepVector[][] |
sweepCPM(double[][] subsetCPM,
int[] sweepIndices,
boolean genSweepVector)
This method performs the sweeping action described in section II of doc.
|
static void |
sweepCPMElements(java.util.Set<hex.modelselection.ModelSelectionUtils.SweepElement>[] sweepElements,
double[][] subsetCPM) |
static void |
sweepCPMNewPredwSVs(double[][] subsetCPM,
int sweepIndex,
ModelSelectionUtils.SweepVector[] sv,
java.util.List<java.lang.Integer> newSweepIndices)
This method will perform sweeping on the rows/columns of CPM corresponding to the newly replaced predictor.
|
static double |
sweepMSE(double[][] subsetCPM,
java.util.List<java.lang.Integer> sweepIndices) |
static double[][] |
unsweptPredAfterReplacedPred(int[] predSubset,
double[][] subsetCPM,
double[][] origCPM,
int[][] predInd2CPMInd,
boolean hasIntercept,
int predPos,
int[] sweepIndicesRemovedPred,
java.util.List<java.lang.Integer> newAllSweepIndices) |
static void |
updateCPMSV(ModelSelection.SweepModel bestModel,
double[][] subsetCPM,
int[] newSweepIndices,
java.util.List<java.lang.Integer> newAllSweepIndices,
int[] sweepIndicesRemovedPred)
Given CPM, sweep vector from forward model and the new predictor, this function aims to do the following:
1.
|
static void |
updateLaterIndices(int[] currentPredIndices,
int indexUpdated,
int lastPredInd)
Given 5 predictors and say we want the combo of 3 predictors, this function will properly reset the prediction
combination indices say from [0, 1, 4] -> [0, 2, 3] or [0, 3, 4] -> [1, 2, 3].
|
static void |
updatePredIndices(int[] currentPredIndices,
int[] indicesBounds)
Given predictor indices stored in currentPredIndices, we need to find the next combination of predictor indices
to use to generate the next combination.
|
static ModelSelectionUtils.SweepVector[][] |
updateSweepVectors(double[][] subsetCPM,
ModelSelectionUtils.SweepVector[][] sweepVector,
int[] subsetPred,
int[][] predInd2CPMInd)
Given the predictor subset stored in subsetPred, the newest predictor index is the last element of subsetPred.
|
public static water.fvec.Frame[] generateTrainingFrames(ModelSelectionModel.ModelSelectionParameters parms, int predNum, java.lang.String[] predNames, int numModels, java.lang.String foldColumn)
public static void updatePredIndices(int[] currentPredIndices,
int[] indicesBounds)
currentPredIndices - indicesBounds - public static void updateLaterIndices(int[] currentPredIndices,
int indexUpdated,
int lastPredInd)
currentPredIndices - indexUpdated - lastPredInd - public static water.fvec.Frame generateOneFrame(int[] predIndices,
hex.Model.Parameters parms,
java.lang.String[] predNames,
java.lang.String foldColumn)
predIndices - parms - predNames - public static void setBitSet(java.util.BitSet predBitSet,
int[] currIndices)
public static int[][] mapPredIndex2CPMIndices(DataInfo dinfo, int predLength)
public static double[][] createCrossProductMatrix(water.Key jobKey,
DataInfo dinfo)
public static double calR2Scale(water.fvec.Frame train,
java.lang.String resp)
public static water.fvec.Frame[] generateMaxRTrainingFrames(ModelSelectionModel.ModelSelectionParameters parms, java.lang.String[] predictorNames, java.lang.String foldColumn, java.util.List<java.lang.Integer> currSubsetIndices, int newPredPos, java.util.List<java.lang.Integer> validSubsets, java.util.Set<java.util.BitSet> usedCombo)
predictorNames - foldColumn - currSubsetIndices - validSubsets - Lists containing only valid predictor indices to choose from
public static double[][] unsweptPredAfterReplacedPred(int[] predSubset,
double[][] subsetCPM,
double[][] origCPM,
int[][] predInd2CPMInd,
boolean hasIntercept,
int predPos,
int[] sweepIndicesRemovedPred,
java.util.List<java.lang.Integer> newAllSweepIndices)
public static double[] generateAllErrorVariances(double[][] allCPM,
ModelSelectionUtils.SweepVector[][] sweepVec,
double[][] prevCPM,
java.util.List<java.lang.Integer> currSubsetIndices,
java.util.List<java.lang.Integer> validSubsets,
java.util.Set<java.util.BitSet> usedCombo,
java.util.BitSet tempIndices,
int[][] pred2CPMIndices,
boolean hasIntercept,
int predPos,
int removedPred,
int[] sweepIndices)
public static void genMSE4MorePreds(int[][] pred2CPMIndices,
double[][] allCPM,
ModelSelectionUtils.SweepVector[][] sweepVec,
int[] allPreds,
double[][] prevCPM,
double[] subsetMSE,
jsr166y.RecursiveAction[] resA,
int resCount,
boolean hasIntercept,
int predPos,
int removedPred,
int[] sweepIndices)
public static void updateCPMSV(ModelSelection.SweepModel bestModel, double[][] subsetCPM, int[] newSweepIndices, java.util.List<java.lang.Integer> newAllSweepIndices, int[] sweepIndicesRemovedPred)
public static void replaceSweepVectors(ModelSelectionUtils.SweepVector[][] origSV, ModelSelectionUtils.SweepVector[][] newSV, int startIndex)
public static void sweepCPMNewPredwSVs(double[][] subsetCPM,
int sweepIndex,
ModelSelectionUtils.SweepVector[] sv,
java.util.List<java.lang.Integer> newSweepIndices)
public static double[][] replaceCPMwNewPred(double[][] cpm,
double[][] allCPM,
int[][] pred2CPM,
int[] oldSweepIndices,
int[] newSweepIndices,
int[] predSubset,
boolean hasIntercept)
public static double sweepMSE(double[][] subsetCPM,
java.util.List<java.lang.Integer> sweepIndices)
public static void sweepCPMElements(java.util.Set<hex.modelselection.ModelSelectionUtils.SweepElement>[] sweepElements,
double[][] subsetCPM)
public static void process(hex.modelselection.ModelSelectionUtils.SweepElement currEle,
java.util.List<hex.modelselection.ModelSelectionUtils.SweepElement> tempList)
public static void genMSE1stPred(int[][] pred2CPMIndices,
double[][] allCPM,
int[] allPreds,
double[] subsetMSE,
jsr166y.RecursiveAction[] resA,
int resCount,
boolean hasIntercept)
public static ModelSelectionUtils.SweepVector[][] mapBasicVector2Multiple(ModelSelectionUtils.SweepVector[][] sweepVec, int newPredCPMLen)
public static void applySweepVectors2NewPred(ModelSelectionUtils.SweepVector[][] sweepVec, double[][] subsetCPM, int numNewRows, int[] sweepMat)
public static void oneSweepWSweepVector(ModelSelectionUtils.SweepVector[] sweepVec, double[][] subsetCPM, int sweepIndex, int colRowsAdded)
public static double[][] addNewPred2CPM(double[][] allCPM,
double[][] currentCPM,
int[] subsetPredIndex,
int[][] pred2CPMIndices,
boolean hasIntercept)
public static ModelSelectionUtils.SweepVector[][] updateSweepVectors(double[][] subsetCPM, ModelSelectionUtils.SweepVector[][] sweepVector, int[] subsetPred, int[][] predInd2CPMInd)
public static int[] extractSweepIndices(java.util.List<java.lang.Integer> currSubsetIndices,
int predPos,
int predRemoved,
int[][] predInd2CPMIndices,
boolean hasIntercept)
public static ModelSelectionUtils.SweepVector[][] mergeSV(ModelSelectionUtils.SweepVector[][] sweepVector, ModelSelectionUtils.SweepVector[][] removedPredSV)
public static ModelSelectionUtils.SweepVector[] genNewSV(ModelSelectionUtils.SweepVector[] oldSV, double[][] subsetCPM, int newPredLen, int sweepInd)
public static java.util.List<java.lang.Integer> extractCPMIndexFromPred(double[][] allCPM,
int[][] pred2CPMIndices,
int[] newPredList,
boolean hasIntercept)
public static void genBestSweepVector(ModelSelection.SweepModel bestModel, double[][] cpm, int[][] pred2CPMIndices, boolean hasIntercept)
public static ModelSelectionUtils.SweepVector[][] sweepCPM(double[][] subsetCPM, int[] sweepIndices, boolean genSweepVector)
public static void performOneSweep(double[][] subsetCPM,
ModelSelectionUtils.SweepVector[] sweepVec,
int sweepIndex,
boolean genSweepVector)
public static java.lang.String[][] shrinkStringArray(java.lang.String[][] array,
int numModels)
public static double[][] shrinkDoubleArray(double[][] array,
int numModels)
public static water.Key[] shrinkKeyArray(water.Key[] array,
int numModels)
public static java.lang.String joinDouble(double[] val)
public static GLMModel.GLMParameters[] generateGLMParameters(water.fvec.Frame[] trainingFrames, ModelSelectionModel.ModelSelectionParameters parms, int nfolds, java.lang.String foldColumn, hex.Model.Parameters.FoldAssignmentScheme foldAssignment)
public static void setParamField(hex.Model.Parameters params,
GLMModel.GLMParameters glmParam,
boolean superClassParams,
java.lang.reflect.Field[] paramFields,
java.util.List<java.lang.String> excludeList)
public static GLM[] buildGLMBuilders(GLMModel.GLMParameters[] trainingParams)
public static void removeTrainingFrames(water.fvec.Frame[] trainingFrames)
public static GLMModel findBestModel(GLM[] glmResults)
glmResults - public static java.lang.String[] extractPredictorNames(hex.Model.Parameters parms,
DataInfo dinfo,
java.lang.String foldColumn)
public static int findMinZValue(GLMModel model, java.util.List<java.lang.String> numPredNames, java.util.List<java.lang.String> catPredNames, java.util.List<java.lang.String> predNames)
public static hex.modelselection.ModelSelectionUtils.PredNameMinZVal findNumMinZVal(java.util.List<java.lang.String> numPredNames,
java.util.List<java.lang.Double> zValList,
java.util.List<java.lang.String> coeffNames)
public static hex.modelselection.ModelSelectionUtils.PredNameMinZVal findCatMinZVal(GLMModel model, java.util.List<java.lang.Double> zValList)
public static java.util.List<java.lang.String> extraModelColumnNames(java.util.List<java.lang.String> coefNames,
GLMModel bestModel)
public static double[][] extractPredSubsetsCPM(double[][] allCPM,
int[] predIndices,
int[][] pred2CPMIndices,
boolean hasIntercept)