// Java tutorial
package hex; import hex.genmodel.GenModel; import hex.genmodel.MojoModel; import hex.genmodel.easy.EasyPredictModelWrapper; import hex.genmodel.easy.RowData; import hex.genmodel.easy.exception.PredictException; import hex.genmodel.easy.prediction.*; import hex.genmodel.utils.DistributionFamily; import org.apache.commons.io.IOUtils; import org.joda.time.DateTime; import water.*; import water.api.FSIOException; import water.api.ModelsHandler; import water.api.StreamWriter; import water.api.StreamingSchema; import water.api.schemas3.KeyV3; import water.codegen.CodeGenerator; import water.codegen.CodeGeneratorPipeline; import water.exceptions.JCodeSB; import water.fvec.*; import water.parser.BufferedString; import water.persist.Persist; import water.udf.CFuncRef; import water.util.*; import java.io.*; import java.lang.reflect.Field; import java.net.URI; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import static water.util.FrameUtils.categoricalEncoder; import static water.util.FrameUtils.cleanUp; /** * A Model models reality (hopefully). * A model can be used to 'score' a row (make a prediction), or a collection of * rows on any compatible dataset - meaning the row has all the columns with the * same names as used to build the mode and any categorical columns can * be adapted. 
 */
public abstract class Model<M extends Model<M, P, O>, P extends Model.Parameters, O extends Model.Output> extends Lockable<M> {

  public P _parms; // TODO: move things around so that this can be protected
  public O _output; // TODO: move things around so that this can be protected
  public String[] _warnings = new String[0]; // warning associated with model building
  public String[] _warningsP; // warnings associated with prediction only
  public Distribution _dist;
  protected ScoringInfo[] scoringInfo;
  // Keys of auxiliary objects to be removed when this model is deleted.
  public IcedHashMap<Key, String> _toDelete = new IcedHashMap<>();

  /** @return all Model instances currently present in the DKV (distributed key/value store). */
  public static Model[] fetchAll() {
    final Key[] modelKeys = KeySnapshot.globalSnapshot().filter(new KeySnapshot.KVFilter() {
      @Override
      public boolean filter(KeySnapshot.KeyInfo k) {
        return Value.isSubclassOf(k._type, Model.class);
      }
    }).keys();

    Model[] models = new Model[modelKeys.length];
    for (int i = 0; i < modelKeys.length; i++) {
      Model model = ModelsHandler.getFromDKV("(none)", modelKeys[i]);
      models[i] = model;
    }

    return models;
  }

  /** Implemented by models that can expose per-layer feature representations. */
  public interface DeepFeatures {
    Frame scoreAutoEncoder(Frame frame, Key destination_key, boolean reconstruction_error_per_feature);
    Frame scoreDeepFeatures(Frame frame, final int layer);
    Frame scoreDeepFeatures(Frame frame, final int layer, final Job j); //for Deep Learning
    Frame scoreDeepFeatures(Frame frame, final String layer, final Job j); //for Deep Water
  }

  /** Implemented by models that can score reconstructions/archetypes (GLRM). */
  public interface GLRMArchetypes {
    Frame scoreReconstruction(Frame frame, Key<Frame> destination_key, boolean reverse_transform);
    Frame scoreArchetypes(Frame frame, Key<Frame> destination_key, boolean reverse_transform);
  }

  /** Implemented by tree-based models that can report the leaf node each row lands in. */
  public interface LeafNodeAssignment {
    Frame scoreLeafNodeAssignment(Frame frame, Key<Frame> destination_key);
  }

  /** Implemented by models that can report the member rows of an exemplar. */
  public interface ExemplarMembers {
    Frame scoreExemplarMembers(Key<Frame> destination_key, int exemplarIdx);
  }

  /** Implemented by models that can rank their most important features. */
  public interface GetMostImportantFeatures {
    String[] getMostImportantFeatures(int n);
  }

  /**
   * Default threshold for assigning class labels to the target class (for binomial models)
   * @return threshold in 0...1
   */
  public double defaultThreshold() {
    return defaultThreshold(_output);
  }

  public static <O extends Model.Output> double defaultThreshold(O output) {
    // Only binomial models with training metrics carry a meaningful threshold; otherwise use 0.5.
    if (output.nclasses() != 2 || output._training_metrics == null)
      return 0.5;
    // Prefer the threshold derived from validation AUC, then fall back to training AUC.
    if (output._validation_metrics != null && ((ModelMetricsBinomial) output._validation_metrics)._auc != null)
      return ((ModelMetricsBinomial) output._validation_metrics)._auc.defaultThreshold();
    if (((ModelMetricsBinomial) output._training_metrics)._auc != null)
      return ((ModelMetricsBinomial) output._training_metrics)._auc.defaultThreshold();
    return 0.5;
  }

  public final boolean isSupervised() { return _output.isSupervised(); }

  public boolean havePojo() { return ModelBuilder.havePojo(_parms.algoName()); }
  public boolean haveMojo() { return ModelBuilder.haveMojo(_parms.algoName()); }

  /**
   * Identifies the default ordering method for models returned from Grid Search
   * @return default sort-by
   */
  public GridSortBy getDefaultGridSortBy() {
    if (!isSupervised())
      return null;
    else if (_output.nclasses() > 1)
      return GridSortBy.LOGLOSS;
    else
      return GridSortBy.RESDEV;
  }

  public static class GridSortBy { // intentionally not an enum to allow 3rd party extensions
    public static final GridSortBy LOGLOSS = new GridSortBy("logloss", false);
    public static final GridSortBy RESDEV = new GridSortBy("residual_deviance", false);
    public static final GridSortBy R2 = new GridSortBy("r2", true);

    public final String _name;
    public final boolean _decreasing; // true when larger values of the metric are better

    GridSortBy(String name, boolean decreasing) { _name = name; _decreasing = decreasing; }
  }

  /** @return column-to-eigen-vector transformation helper; null when the model has none. */
  public ToEigenVec getToEigenVec() { return null; }

  /** Model-specific parameter class.  Each model sub-class contains
   *  instance of one of these containing its builder parameters, with
   *  model-specific parameters.  E.g. KMeansModel extends Model and has a
   *  KMeansParameters extending Model.Parameters; sample parameters include K,
   *  whether or not to normalize, max iterations and the initial random seed.
   *
   * <p>The non-transient fields are input parameters to the model-building
   * process, and are considered "first class citizens" by the front-end - the
   * front-end will cache Parameters (in the browser, in JavaScript, on disk)
   * and rebuild Parameter instances from those caches.
   *
   * WARNING: Model Parameters is not immutable object and ModelBuilder can modify
   * them!
   */
  public abstract static class Parameters extends Iced<Parameters> {
    /** Maximal number of supported levels in response. */
    public static final int MAX_SUPPORTED_LEVELS = 1 << 20;

    /** The short name, used in making Keys.  e.g. "GBM" */
    abstract public String algoName();

    /** The pretty algo name for this Model (e.g., Gradient Boosting Machine, rather than GBM).*/
    abstract public String fullName();

    /** The Java class name for this Model (e.g., hex.tree.gbm.GBM, rather than GBM).*/
    abstract public String javaName();

    /** Default relative tolerance for convergence-based early stopping */
    protected double defaultStoppingTolerance() { return 1e-3; }

    /** How much work will be done for this model?
     */
    abstract public long progressUnits();

    public Key<Frame> _train; // User-Key of the Frame the Model is trained on
    public Key<Frame> _valid; // User-Key of the Frame the Model is validated on, if any

    // Cross-validation configuration
    public int _nfolds = 0;
    public boolean _keep_cross_validation_predictions = false;
    public boolean _keep_cross_validation_fold_assignment = false;
    public boolean _parallelize_cross_validation = true;
    public boolean _auto_rebalance = true;

    public void setTrain(Key<Frame> train) {
      this._train = train;
    }

    public enum FoldAssignmentScheme {
      AUTO, Random, Modulo, Stratified
    }

    public enum CategoricalEncodingScheme {
      AUTO(false), OneHotInternal(false), OneHotExplicit(false), Enum(false), Binary(false), Eigen(false),
      LabelEncoder(false), SortByResponse(true), EnumLimited(false);

      CategoricalEncodingScheme(boolean needResponse) { _needResponse = needResponse; }
      final boolean _needResponse; // does this encoding need the response column? (only SortByResponse does)
      boolean needsResponse() { return _needResponse; }
    }

    public long _seed = -1;

    /** Lazily replace the sentinel seed (-1) with a time-based random seed and return it. */
    public long getOrMakeRealSeed() {
      while (_seed == -1) {
        _seed = RandomUtils.getRNG(System.nanoTime()).nextLong();
        Log.debug("Auto-generated time-based seed for pseudo-random number generator (because it was set to -1): " + _seed);
      }
      return _seed;
    }

    public FoldAssignmentScheme _fold_assignment = FoldAssignmentScheme.AUTO;
    public CategoricalEncodingScheme _categorical_encoding = CategoricalEncodingScheme.AUTO;
    public int _max_categorical_levels = 10;

    public DistributionFamily _distribution = DistributionFamily.AUTO;
    public double _tweedie_power = 1.5;
    public double _quantile_alpha = 0.5;
    public double _huber_alpha = 0.9;

    // TODO: This field belongs in the front-end column-selection process and
    // NOT in the parameters - because this requires all model-builders to have
    // column strip/ignore code.
    public String[] _ignored_columns; // column names to ignore for training
    public boolean _ignore_const_cols; // True if dropping constant cols
    public String _weights_column;
    public String _offset_column;
    public String _fold_column;
    public boolean _is_cv_model; //internal helper

    // Scoring a model on a dataset is not free; sometimes it is THE limiting
    // factor to model building.  By default, partially built models are only
    // scored every so many major model iterations - throttled to limit scoring
    // costs to less than 10% of the build time.  This flag forces scoring for
    // every iteration, allowing e.g. more fine-grained progress reporting.
    public boolean _score_each_iteration;

    /**
     * Maximum allowed runtime in seconds for model training. Use 0 to disable.
     */
    public double _max_runtime_secs = 0;

    /**
     * Early stopping based on convergence of stopping_metric.
     * Stop if simple moving average of the stopping_metric does not improve by stopping_tolerance for
     * k scoring events.
     * Can only trigger after at least 2k scoring events. Use 0 to disable.
     */
    public int _stopping_rounds = 0;

    /**
     * Metric to use for convergence checking, only for _stopping_rounds > 0.
     */
    public ScoreKeeper.StoppingMetric _stopping_metric = ScoreKeeper.StoppingMetric.AUTO;

    /**
     * Relative tolerance for metric-based stopping criterion: stop if relative improvement is not at least this much.
     */
    public double _stopping_tolerance = defaultStoppingTolerance();

    /** Supervised models have an expected response they get to train with! */
    public String _response_column; // response column name

    /** Should all classes be over/under-sampled to balance the class
     *  distribution? */
    public boolean _balance_classes = false;

    /** When classes are being balanced, limit the resulting dataset size to
     *  the specified multiple of the original dataset size.  Maximum relative
     *  size of the training data after balancing class counts (can be less
     *  than 1.0) */
    public float _max_after_balance_size = 5.0f;

    /**
     * Desired over/under-sampling ratios per class (lexicographic order).
     * Only when balance_classes is enabled.
     * If not specified, they will be automatically computed to obtain class balance during training.
     */
    public float[] _class_sampling_factors;

    /** For classification models, the maximum size (in terms of classes) of
     *  the confusion matrix for it to be printed. This option is meant to
     *  avoid printing extremely large confusion matrices.  */
    public int _max_confusion_matrix_size = 20;

    /**
     * A model key associated with a previously trained Deep Learning
     * model. This option allows users to build a new model as a
     * continuation of a previously generated model.
     */
    public Key<? extends Model> _checkpoint;

    /**
     * A pretrained Autoencoder DL model with matching inputs and hidden layers
     * can be used to initialize the weights and biases (excluding the output layer).
     */
    public Key<? extends Model> _pretrained_autoencoder;

    /**
     * Reference to custom metric function.
     */
    public String _custom_metric_func = null;

    // Public no-arg constructor for reflective creation
    public Parameters() {
      _ignore_const_cols = defaultDropConsCols();
    }

    /** @return the training frame instance */
    public final Frame train() {
      return _train == null ? null : _train.get();
    }

    /** @return the validation frame instance, or null
     *  if a validation frame was not specified */
    public final Frame valid() {
      return _valid == null ? null : _valid.get();
    }

    /** Read-Lock both training and validation User frames. */
    public void read_lock_frames(Job job) {
      Frame tr = train();
      if (tr != null)
        tr.read_lock(job._key);
      // Only lock the validation frame when it is a different frame than train.
      if (_valid != null && !_train.equals(_valid))
        _valid.get().read_lock(job._key);
    }

    /** Read-UnLock both training and validation User frames.  This method is
     *  called on crashing cleanup paths, so handles the case where the frames
     *  are not actually locked.
*/ public void read_unlock_frames(Job job) { Frame tr = train(); if (tr != null) tr.unlock(job._key, false); if (_valid != null && !_train.equals(_valid)) valid().unlock(job._key, false); } // Override in subclasses to change the default; e.g. true in GLM protected boolean defaultDropConsCols() { return true; } /** Type of missing columns during adaptation between train/test datasets * Overload this method for models that have sparse data handling - a zero * will preserve the sparseness. Otherwise, NaN is used. * @return real-valued number (can be NaN) */ public double missingColumnsType() { return Double.NaN; } public boolean hasCheckpoint() { return _checkpoint != null; } // FIXME: this is really horrible hack, Model.Parameters has method checksum_impl, // but not checksum, the API is totally random :( public long checksum() { return checksum_impl(); } /** * Compute a checksum based on all non-transient non-static ice-able assignable fields (incl. inherited ones) which have @API annotations. * Sort the fields first, since reflection gives us the fields in random order and we don't want the checksum to be affected by the field order. * NOTE: if a field is added to a Parameters class the checksum will differ even when all the previous parameters have the same value. If * a client wants backward compatibility they will need to compare parameter values explicitly. * * The method is motivated by standard hash implementation `hash = hash * P + value` but we use high prime numbers in random order. 
* @return checksum */ protected long checksum_impl() { long xs = 0x600DL; int count = 0; Field[] fields = Weaver.getWovenFields(this.getClass()); Arrays.sort(fields, new Comparator<Field>() { public int compare(Field field1, Field field2) { return field1.getName().compareTo(field2.getName()); } }); for (Field f : fields) { final long P = MathUtils.PRIMES[count % MathUtils.PRIMES.length]; Class<?> c = f.getType(); if (c.isArray()) { try { f.setAccessible(true); if (f.get(this) != null) { if (c.getComponentType() == Integer.TYPE) { int[] arr = (int[]) f.get(this); xs = xs * P + (long) Arrays.hashCode(arr); } else if (c.getComponentType() == Float.TYPE) { float[] arr = (float[]) f.get(this); xs = xs * P + (long) Arrays.hashCode(arr); } else if (c.getComponentType() == Double.TYPE) { double[] arr = (double[]) f.get(this); xs = xs * P + (long) Arrays.hashCode(arr); } else if (c.getComponentType() == Long.TYPE) { long[] arr = (long[]) f.get(this); xs = xs * P + (long) Arrays.hashCode(arr); } else { Object[] arr = (Object[]) f.get(this); xs = xs * P + (long) Arrays.deepHashCode(arr); } //else lead to ClassCastException } else { xs = xs * P; } } catch (IllegalAccessException e) { throw new RuntimeException(e); } catch (ClassCastException t) { throw H2O.fail(); //no support yet for int[][] etc. } } else { try { f.setAccessible(true); Object value = f.get(this); if (value != null) { xs = xs * P + (long) (value.hashCode()); } else { xs = xs * P + P; } } catch (IllegalAccessException e) { throw new RuntimeException(e); } } count++; } xs ^= (train() == null ? 43 : train().checksum()) * (valid() == null ? 
17 : valid().checksum()); return xs; } } public ModelMetrics addModelMetrics(final ModelMetrics mm) { DKV.put(mm); incrementModelMetrics(_output, mm._key); return mm; } static void incrementModelMetrics(Output out, Key k) { synchronized (out) { for (Key key : out._model_metrics) if (k.equals(key)) return; out._model_metrics = Arrays.copyOf(out._model_metrics, out._model_metrics.length + 1); out._model_metrics[out._model_metrics.length - 1] = k; } } public void addWarning(String s) { _warnings = Arrays.copyOf(_warnings, _warnings.length + 1); _warnings[_warnings.length - 1] = s; } public static class InteractionSpec extends Iced { private String[] _columns; private StringPair[] _pairs; private InteractionSpec(String[] columns, StringPair[] pairs) { _columns = columns; _pairs = pairs; } public static InteractionSpec allPairwise(String[] columns) { return columns != null ? new InteractionSpec(columns, null) : null; } public static InteractionSpec create(String[] columns, StringPair[] pairs) { return columns == null && pairs == null ? 
null : new InteractionSpec(columns, pairs); } public boolean isEmpty() { return _columns == null && _pairs == null; } public Model.InteractionPair[] makeInteractionPairs(Frame f) { if (isEmpty()) return null; InteractionPair[] allPairwise = null; InteractionPair[] allExplicit = null; int[] interactionIDs = new int[0]; if (_columns != null) { interactionIDs = new int[_columns.length]; for (int i = 0; i < _columns.length; ++i) { interactionIDs[i] = f.find(_columns[i]); if (interactionIDs[i] == -1) throw new IllegalArgumentException( "missing column from the dataset, could not make interaction: " + interactionIDs[i]); } allPairwise = Model.InteractionPair.generatePairwiseInteractionsFromList(interactionIDs); } if (_pairs != null) { Arrays.sort(interactionIDs); allExplicit = new InteractionPair[_pairs.length]; int n = 0; for (StringPair p : _pairs) { int aIdx = f.find(p._a); if (aIdx == -1) throw new IllegalArgumentException( "Invalid interactions specified (first column is missing): " + p.toJsonString()); int bIdx = f.find(p._b); if (bIdx == -1) throw new IllegalArgumentException( "Invalid interactions specified (second column is missing): " + p.toJsonString()); if (Arrays.binarySearch(interactionIDs, aIdx) >= 0 && Arrays.binarySearch(interactionIDs, bIdx) >= 0) continue; // This interaction is already included in set of all pairwise interactions allExplicit[n++] = new InteractionPair(aIdx, bIdx, null, null); } if (n != allExplicit.length) { InteractionPair[] resized = new InteractionPair[n]; System.arraycopy(allExplicit, 0, resized, 0, resized.length); allExplicit = resized; } } if (allExplicit == null) return allPairwise; else return ArrayUtils.append(allPairwise, allExplicit); } } /** Model-specific output class. Each model sub-class contains an instance * of one of these containing its "output": the pieces of the model needed * for scoring. E.g. KMeansModel has a KMeansOutput extending Model.Output * which contains the cluster centers. 
   *  The output also includes the names,
   *  domains and other fields which are determined at training time.  */
  public abstract static class Output extends Iced {
    /** Columns used in the model and are used to match up with scoring data
     *  columns.  The last name is the response column name (if any). */
    public String _names[];

    public void setNames(String[] names) {
      _names = names;
    }

    public String _origNames[];

    /** Categorical/factor mappings, per column.  Null for non-categorical cols.
     *  Columns match the post-init cleanup columns.  The last column holds the
     *  response col categoricals for SupervisedModels.  */
    public String _domains[][];
    public String _origDomains[][];

    /** List of Keys to cross-validation models (non-null iff _parms._nfolds > 1 or _parms._fold_column != null) **/
    public Key _cross_validation_models[];
    /** List of Keys to cross-validation predictions (if requested) **/
    public Key _cross_validation_predictions[];
    public Key<Frame> _cross_validation_holdout_predictions_frame_id;
    public Key<Frame> _cross_validation_fold_assignment_frame_id;

    // Model-specific start/end/run times
    // Each individual model's start/end/run time is reported here, not the total time to build N+1 cross-validation models, or all grid models
    public long _start_time;
    public long _end_time;
    public long _run_time;

    protected void startClock() {
      _start_time = System.currentTimeMillis();
    }

    protected void stopClock() {
      _end_time = System.currentTimeMillis();
      _run_time = _end_time - _start_time;
    }

    public Output() {
      this(false, false, false);
    }

    public Output(boolean hasWeights, boolean hasOffset, boolean hasFold) {
      _hasWeights = hasWeights;
      _hasOffset = hasOffset;
      _hasFold = hasFold;
    }

    /** Any final prep-work just before model-building starts, but after the
     *  user has clicked "go".  E.g., converting a response column to an categorical
     *  touches the entire column (can be expensive), makes a parallel vec
     *  (Key/Data leak management issues), and might throw IAE if there are too
     *  many classes.
     */
    public Output(ModelBuilder b) {
      _isSupervised = b.isSupervised();
      if (b.error_count() > 0)
        throw new IllegalArgumentException(b.validationErrors());
      // Capture the data "shape" the model is valid on
      setNames(b._train != null ? b._train.names() : new String[0]);
      _domains = b._train != null ? b._train.domains() : new String[0][];
      _origNames = b._origNames;
      _origDomains = b._origDomains;
      _hasOffset = b.hasOffsetCol();
      _hasWeights = b.hasWeightCol();
      _hasFold = b.hasFoldCol();
      _distribution = b._distribution;
      _priorClassDist = b._priorClassDist;
      assert (_job == null); // only set after job completion
    }

    /** Returns number of input features (OK for most supervised methods, need to override for unsupervised!) */
    public int nfeatures() {
      return _names.length - (_hasOffset ? 1 : 0) - (_hasWeights ? 1 : 0) - (_hasFold ? 1 : 0) - (isSupervised() ? 1 : 0);
    }

    /** List of all the associated ModelMetrics objects, so we can delete them
     *  when we delete this model. */
    Key[] _model_metrics = new Key[0];

    /** Job info: final status (canceled, crashed), build time */
    public Job _job;

    /**
     * Training set metrics obtained during model training
     */
    public ModelMetrics _training_metrics;

    /**
     * Validation set metrics obtained during model training (if a validation data set was specified)
     */
    public ModelMetrics _validation_metrics;

    /**
     * Cross-Validation metrics obtained during model training
     */
    public ModelMetrics _cross_validation_metrics;

    /**
     * Summary of cross-validation metrics of all k-fold models
     */
    public TwoDimTable _cross_validation_metrics_summary;

    /**
     * User-facing model summary - Display model type, complexity, size and other useful stats
     */
    public TwoDimTable _model_summary;

    /**
     * User-facing model scoring history - 2D table with modeling accuracy as a function of time/trees/epochs/iterations, etc.
     */
    public TwoDimTable _scoring_history;

    public double[] _distribution;
    public double[] _modelClassDist;
    public double[] _priorClassDist;

    protected boolean _isSupervised;

    public boolean isSupervised() { return _isSupervised; }

    protected final boolean _hasOffset; // weights and offset are kept at designated position in the names array
    protected final boolean _hasWeights;// only need to know if we have them
    protected final boolean _hasFold;// only need to know if we have them

    public boolean hasOffset() { return _hasOffset; }
    public boolean hasWeights() { return _hasWeights; }
    public boolean hasFold() { return _hasFold; }

    /** The name of the response column (which is always the last column). */
    public String responseName() { return isSupervised() ? _names[responseIdx()] : null; }
    public String weightsName() { return _hasWeights ? _names[weightsIdx()] : null; }
    public String offsetName() { return _hasOffset ? _names[offsetIdx()] : null; }
    public String foldName() { return _hasFold ? _names[foldIdx()] : null; }

    public InteractionSpec interactions() { return null; }

    // Vec layout is [c1,c2,...,cn,w?,o?,r], cn are predictor cols, r is response, w and o are weights and offset, both are optional
    public int weightsIdx() {
      if (!_hasWeights) return -1;
      return _names.length - (isSupervised() ? 1 : 0) - (hasOffset() ? 1 : 0) - 1 - (hasFold() ? 1 : 0);
    }

    public int offsetIdx() {
      if (!_hasOffset) return -1;
      return _names.length - (isSupervised() ? 1 : 0) - (hasFold() ? 1 : 0) - 1;
    }

    public int foldIdx() {
      if (!_hasFold) return -1;
      return _names.length - (isSupervised() ? 1 : 0) - 1;
    }

    public int responseIdx() {
      if (!isSupervised()) return -1;
      return _names.length - 1;
    }

    /** Names of levels for a categorical response column. */
    public String[] classNames() {
      if (_domains == null || _domains.length == 0 || !isSupervised()) return null;
      return _domains[_domains.length - 1];
    }

    /** Is this model a classification model?  (v.
a regression or clustering model) */ public boolean isClassifier() { return isSupervised() && nclasses() > 1; } /** Is this model a binomial classification model? (v. a regression or clustering model) */ public boolean isBinomialClassifier() { return isSupervised() && nclasses() == 2; } /**Is this model a multinomial classification model (supervised and nclasses() > 2 */ public boolean isMultinomialClassifier() { return isSupervised() && nclasses() > 2; } /** Number of classes in the response column if it is categorical and the model is supervised. */ public int nclasses() { String cns[] = classNames(); return cns == null ? 1 : cns.length; } // Note: some algorithms MUST redefine this method to return other model categories public ModelCategory getModelCategory() { if (isSupervised()) return (isClassifier() ? (nclasses() > 2 ? ModelCategory.Multinomial : ModelCategory.Binomial) : ModelCategory.Regression); return ModelCategory.Unknown; } public boolean isAutoencoder() { return false; } // Override in DeepLearning and so on. public synchronized void clearModelMetrics() { _model_metrics = new Key[0]; } public synchronized Key<ModelMetrics>[] getModelMetrics() { return Arrays.copyOf(_model_metrics, _model_metrics.length); } protected long checksum_impl() { return (null == _names ? 13 : Arrays.hashCode(_names)) * (null == _domains ? 
17 : Arrays.deepHashCode(_domains)) * getModelCategory().ordinal(); } public void printTwoDimTables(StringBuilder sb, Object o) { for (Field f : Weaver.getWovenFields(o.getClass())) { Class<?> c = f.getType(); if (c.isAssignableFrom(TwoDimTable.class)) { try { TwoDimTable t = (TwoDimTable) f.get(this); f.setAccessible(true); if (t != null) sb.append(t.toString(1, false /*don't print the full table if too long*/)); } catch (IllegalAccessException e) { e.printStackTrace(); } } } } @Override public String toString() { StringBuilder sb = new StringBuilder(); if (_training_metrics != null) sb.append(_training_metrics.toString()); if (_validation_metrics != null) sb.append(_validation_metrics.toString()); if (_cross_validation_metrics != null) sb.append(_cross_validation_metrics.toString()); printTwoDimTables(sb, this); return sb.toString(); } } // Output protected String[][] scoringDomains() { return _output._domains; } public ModelMetrics addMetrics(ModelMetrics mm) { return addModelMetrics(mm); } public abstract ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain); /** Full constructor */ public Model(Key<M> selfKey, P parms, O output) { super(selfKey); assert parms != null; _parms = parms; _output = output; // Output won't be set if we're assert output != null; if (_output != null) _output.startClock(); _dist = isSupervised() && _output.nclasses() == 1 ? new Distribution(_parms) : null; } /** * Deviance of given distribution function at predicted value f * @param w observation weight * @param y (actual) response * @param f (predicted) response in original response space * @return value of gradient */ public double deviance(double w, double y, double f) { return _dist.deviance(w, y, f); } public ScoringInfo[] scoring_history() { return scoringInfo; } /** * Fill a ScoringInfo with data from the ModelMetrics for this model. 
   * @param scoringInfo */
  public void fillScoringInfo(ScoringInfo scoringInfo) {
    scoringInfo.is_classification = this._output.isClassifier();
    scoringInfo.is_autoencoder = _output.isAutoencoder();
    scoringInfo.scored_train = new ScoreKeeper(this._output._training_metrics);
    scoringInfo.scored_valid = new ScoreKeeper(this._output._validation_metrics);
    scoringInfo.scored_xval = new ScoreKeeper(this._output._cross_validation_metrics);
    scoringInfo.validation = _output._validation_metrics != null;
    scoringInfo.cross_validation = _output._cross_validation_metrics != null;
    if (this._output.isBinomialClassifier()) {
      scoringInfo.training_AUC = this._output._training_metrics == null ? null : ((ModelMetricsBinomial) this._output._training_metrics)._auc;
      scoringInfo.validation_AUC = this._output._validation_metrics == null ? null : ((ModelMetricsBinomial) this._output._validation_metrics)._auc;
    }
  }

  // return the most up-to-date model metrics
  public ScoringInfo last_scored() {
    return scoringInfo == null ? null : scoringInfo[scoringInfo.length - 1];
  }

  // Lower is better
  public float loss() {
    switch (_parms._stopping_metric) {
      case MSE: return (float) mse();
      case MAE: return (float) mae();
      case RMSLE: return (float) rmsle();
      case logloss: return (float) logloss();
      case deviance: return (float) deviance();
      case misclassification: return (float) classification_error();
      case AUC: return (float) (1 - auc()); // invert AUC so that lower is better
      case mean_per_class_error: return (float) mean_per_class_error();
      case lift_top_group: return (float) lift_top_group();
      case AUTO:
      default:
        return (float) (_output.isClassifier() ? logloss() : _output.isAutoencoder() ? mse() : deviance());
    }
  } // loss()

  /** Order two models by loss(); only models of the same kind (and class count) are comparable. */
  public int compareTo(M o) {
    if (o._output.isClassifier() != _output.isClassifier())
      throw new UnsupportedOperationException("Cannot compare classifier against regressor.");
    if (o._output.isClassifier()) {
      if (o._output.nclasses() != _output.nclasses())
        throw new UnsupportedOperationException("Cannot compare models with different number of classes.");
    }
    return (loss() < o.loss() ? -1 : loss() > o.loss() ? 1 : 0);
  }

  // All the metric getters below share the same precedence: the latest ScoringInfo
  // when available, otherwise stored metrics in the order xval > validation > training.
  public double classification_error() {
    if (scoringInfo != null)
      return last_scored().cross_validation ? last_scored().scored_xval._classError
          : last_scored().validation ? last_scored().scored_valid._classError
          : last_scored().scored_train._classError;
    ModelMetrics mm = _output._cross_validation_metrics != null ? _output._cross_validation_metrics
        : _output._validation_metrics != null ? _output._validation_metrics
        : _output._training_metrics;
    if (mm == null) return Double.NaN;
    if (mm instanceof ModelMetricsBinomial) {
      return ((ModelMetricsBinomial) mm)._auc.defaultErr();
    } else if (mm instanceof ModelMetricsMultinomial) {
      return ((ModelMetricsMultinomial) mm)._cm.err();
    }
    return Double.NaN;
  }

  public double mse() {
    if (scoringInfo != null)
      return last_scored().cross_validation ? last_scored().scored_xval._mse
          : last_scored().validation ? last_scored().scored_valid._mse
          : last_scored().scored_train._mse;
    ModelMetrics mm = _output._cross_validation_metrics != null ? _output._cross_validation_metrics
        : _output._validation_metrics != null ? _output._validation_metrics
        : _output._training_metrics;
    if (mm == null) return Double.NaN;
    return mm.mse();
  }

  public double mae() {
    if (scoringInfo != null)
      return last_scored().cross_validation ? last_scored().scored_xval._mae
          : last_scored().validation ? last_scored().scored_valid._mae
          : last_scored().scored_train._mae;
    ModelMetrics mm = _output._cross_validation_metrics != null ? _output._cross_validation_metrics
        : _output._validation_metrics != null ? _output._validation_metrics
        : _output._training_metrics;
    if (mm == null) return Double.NaN;
    return ((ModelMetricsRegression) mm).mae();
  }

  public double rmsle() {
    if (scoringInfo != null)
      return last_scored().cross_validation ? last_scored().scored_xval._rmsle
          : last_scored().validation ? last_scored().scored_valid._rmsle
          : last_scored().scored_train._rmsle;
    ModelMetrics mm = _output._cross_validation_metrics != null ? _output._cross_validation_metrics
        : _output._validation_metrics != null ? _output._validation_metrics
        : _output._training_metrics;
    if (mm == null) return Double.NaN;
    return ((ModelMetricsRegression) mm).rmsle();
  }

  public double auc() {
    if (scoringInfo != null)
      return last_scored().cross_validation ? last_scored().scored_xval._AUC
          : last_scored().validation ? last_scored().scored_valid._AUC
          : last_scored().scored_train._AUC;
    ModelMetrics mm = _output._cross_validation_metrics != null ? _output._cross_validation_metrics
        : _output._validation_metrics != null ? _output._validation_metrics
        : _output._training_metrics;
    if (mm == null) return Double.NaN;
    return ((ModelMetricsBinomial) mm)._auc._auc;
  }

  public double deviance() {
    if (scoringInfo != null)
      return last_scored().cross_validation ? last_scored().scored_xval._mean_residual_deviance
          : last_scored().validation ? last_scored().scored_valid._mean_residual_deviance
          : last_scored().scored_train._mean_residual_deviance;
    ModelMetrics mm = _output._cross_validation_metrics != null ? _output._cross_validation_metrics
        : _output._validation_metrics != null ? _output._validation_metrics
        : _output._training_metrics;
    if (mm == null) return Double.NaN;
    return ((ModelMetricsRegression) mm)._mean_residual_deviance;
  }

  public double logloss() {
    if (scoringInfo != null)
      return last_scored().cross_validation ? last_scored().scored_xval._logloss
          : last_scored().validation ? last_scored().scored_valid._logloss
          : last_scored().scored_train._logloss;
    ModelMetrics mm = _output._cross_validation_metrics != null ? _output._cross_validation_metrics
        : _output._validation_metrics != null ? _output._validation_metrics
        : _output._training_metrics;
    if (mm == null) return Double.NaN;
    if (mm instanceof ModelMetricsBinomial) {
      return ((ModelMetricsBinomial) mm).logloss();
    } else if (mm instanceof ModelMetricsMultinomial) {
      return ((ModelMetricsMultinomial) mm).logloss();
    }
    return Double.NaN;
  }

  public double mean_per_class_error() {
    if (scoringInfo != null)
      return last_scored().cross_validation ? last_scored().scored_xval._mean_per_class_error
          : last_scored().validation ? last_scored().scored_valid._mean_per_class_error
          : last_scored().scored_train._mean_per_class_error;
    ModelMetrics mm = _output._cross_validation_metrics != null ? _output._cross_validation_metrics
        : _output._validation_metrics != null ? _output._validation_metrics
        : _output._training_metrics;
    if (mm == null) return Double.NaN;
    if (mm instanceof ModelMetricsBinomial) {
      return ((ModelMetricsBinomial) mm).mean_per_class_error();
    } else if (mm instanceof ModelMetricsMultinomial) {
      return ((ModelMetricsMultinomial) mm).mean_per_class_error();
    }
    return Double.NaN;
  }

  public double lift_top_group() {
    if (scoringInfo != null)
      return last_scored().cross_validation ? last_scored().scored_xval._lift
          : last_scored().validation ? last_scored().scored_valid._lift
          : last_scored().scored_train._lift;
    ModelMetrics mm = _output._cross_validation_metrics != null ? _output._cross_validation_metrics
        : _output._validation_metrics != null ? _output._validation_metrics
        : _output._training_metrics;
    if (mm == null) return Double.NaN;
    if (mm instanceof ModelMetricsBinomial) {
      GainsLift gl = ((ModelMetricsBinomial) mm)._gainsLift;
      if (gl != null && gl.response_rates != null && gl.response_rates.length > 0) {
        // lift of the top group = its response rate relative to the average response rate
        return gl.response_rates[0] / gl.avg_response_rate;
      }
    }
    return Double.NaN;
  }

  /** Adapt a Test/Validation Frame to be compatible for a Training Frame.
The * intention here is that ModelBuilders can assume the test set has the same * count of columns, and within each factor column the same set of * same-numbered levels. Extra levels are renumbered past those in the * Train set but will still be present in the Test set, thus requiring * range-checking. * * This routine is used before model building (with no Model made yet) to * check for compatible datasets, and also used to prepare a large dataset * for scoring (with a Model). * * Adaptation does the following things: * - Remove any "extra" Vecs appearing only in the test and not the train * - Insert any "missing" Vecs appearing only in the train and not the test * with all NAs ({@link Parameters#missingColumnsType}). This will issue a warning, * and if the "expensive" flag is false won't actually make the column * replacement column but instead will bail-out on the whole adaptation (but * will continue looking for more warnings). * - If all columns are missing, issue an error. * - Renumber matching cat levels to match the Train levels; this might make * "holes" in the Test set cat levels, if some are not in the Test set. * - Extra Test levels are renumbered past the end of the Train set, hence * the train and test levels match up to all the train levels; there might * be extra Test levels past that. * - For all mis-matched levels, issue a warning. * * The {@code test} frame is updated in-place to be compatible, by altering * the names and Vecs; make a defensive copy if you do not want it modified. * There is a fast-path cutout if the test set is already compatible. Since * the test-set is conditionally modified with extra CategoricalWrappedVec optionally * added it is recommended to use a Scope enter/exit to track Vec lifetimes. * * @param test Testing Frame, updated in-place * @param expensive Try hard to adapt; this might involve the creation of * whole Vecs and thus get expensive.
If {@code false}, then only adapt if
 *  no warnings and errors; otherwise just the messages are produced.
 *  Created Vecs have to be deleted by the caller (e.g. Scope.enter/exit).
 *  @return Array of warnings; zero length (never null) for no warnings.
 *  Throws {@code IllegalArgumentException} if no columns are in common, or
 *  if any factor column has no levels in common. */
  public String[] adaptTestForTrain(Frame test, boolean expensive, boolean computeMetrics) {
    // Delegate to the static version using this model's training names/domains/parameters.
    return adaptTestForTrain(test, _output._origNames, _output._origDomains, _output._names, _output._domains, _parms,
        expensive, computeMetrics, _output.interactions(), getToEigenVec(), _toDelete, false);
  }

  /**
   * Static worker for {@link #adaptTestForTrain(Frame, boolean, boolean)}; mutates {@code test} in place.
   * @param test Frame to be adapted
   * @param origNames Training column names before categorical column encoding - can be the same as names
   * @param origDomains Training column levels before categorical column encoding - can be the same as domains
   * @param names Training column names
   * @param domains Training column levels
   * @param parms Model parameters
   * @param expensive Whether to actually do the hard work
   * @param computeMetrics Whether metrics can be (and should be) computed
   * @param interactions Column names to create pairwise interactions with
   * @param tev projector used when Eigen categorical encoding must be (re-)applied
   * @param toDelete map collecting keys of temporary Vecs/Frames created here so the caller can clean them up
   * @param catEncoded Whether the categorical columns of the test frame were already transformed via categorical_encoding
   */
  public static String[] adaptTestForTrain(Frame test, String[] origNames, String[][] origDomains, String[] names,
      String[][] domains, Parameters parms, boolean expensive, boolean computeMetrics, InteractionSpec interactions,
      ToEigenVec tev, IcedHashMap<Key, String> toDelete, boolean catEncoded) throws IllegalArgumentException {
    String[] msg = new String[0];
    if (test == null) return msg;
    // Post-encoding recursion requires the pre-encoding names to decide anything; nothing to do without them.
    if (catEncoded && origNames == null) return msg;

    // test frame matches the training frame (after categorical encoding, if applicable): fast-path, no work.
    String[][] tdomains = test.domains();
    if (names == test._names && domains == tdomains || (Arrays.equals(names, test._names) &&
        Arrays.deepEquals(domains, tdomains)))
      return msg;

    // Keep the (possibly post-encoding) names/domains so the recursive call below can restore them.
    String[] backupNames = names;
    String[][] backupDomains = domains;

    final String weights = parms._weights_column;
    final String offset = parms._offset_column;
    final String fold = parms._fold_column;
    final String response = parms._response_column;

    // whether we need to be careful with categorical encoding - the test frame could be either in original state or in encoded state
    final boolean checkCategoricals = parms._categorical_encoding == Parameters.CategoricalEncodingScheme.OneHotExplicit
        || parms._categorical_encoding == Parameters.CategoricalEncodingScheme.Eigen
        || parms._categorical_encoding == Parameters.CategoricalEncodingScheme.Binary;

    // test frame matches the user-given frame (before categorical encoding, if applicable)
    if (checkCategoricals && origNames != null) {
      boolean match = Arrays.equals(origNames, test._names);
      if (!match) {
        match = true;
        // In case the test set has extra columns not in the training set - check that all original pre-encoding columns are available in the test set
        // We could be lenient here and fill missing columns with NA, but then it gets difficult to decide whether this frame is pre/post encoding, if a certain fraction of columns mismatch...
        for (String s : origNames) {
          match &= ArrayUtils.contains(test.names(), s);
          if (!match) break;
        }
      }
      // still have work to do below, make sure we set the names/domains to the original user-given values such that we can do the int->enum mapping and cat. encoding below (from scratch)
      if (match) {
        names = origNames;
        domains = origDomains;
      }
    }

    // create the interactions now and bolt them on to the front of the test Frame
    if (null != interactions) {
      InteractionPair[] interactionPairs = interactions.makeInteractionPairs(test);
      test.add(makeInteractions(test, false, interactionPairs, true, true, false));
    }

    // Build the validation set to be compatible with the training set.
    // Toss out extra columns, complain about missing ones, remap categoricals
    ArrayList<String> msgs = new ArrayList<>();
    Vec vvecs[] = new Vec[names.length];
    int good = 0;    // Any matching column names, at all?
    int convNaN = 0; // count of columns that were replaced with NA
    for (int i = 0; i < names.length; i++) {
      Vec vec = test.vec(names[i]); // Search in the given validation set
      boolean isResponse = response != null && names[i].equals(response);
      boolean isWeights = weights != null && names[i].equals(weights);
      boolean isOffset = offset != null && names[i].equals(offset);
      boolean isFold = fold != null && names[i].equals(fold);
      // If a training set column is missing in the test set, complain (if it's ok, fill in with NAs (or 0s if it's a fold-column))
      if (vec == null) {
        if (isResponse && computeMetrics)
          throw new IllegalArgumentException("Test/Validation dataset is missing response column '" + response + "'");
        else if (isOffset)
          throw new IllegalArgumentException(H2O.technote(12, "Test/Validation dataset is missing offset column '" + offset + "'. If your intention is to disable the effect of the offset add a zero offset column."));
        else if (isWeights && computeMetrics) {
          if (expensive) {
            vec = test.anyVec().makeCon(1);
            toDelete.put(vec._key, "adapted missing vectors");
            msgs.add(H2O.technote(1, "Test/Validation dataset is missing weights column '" + names[i] + "' (needed because a response was found and metrics are to be computed): substituting in a column of 1s"));
          }
        } else if (expensive) {
          // generate warning even for response columns. Other tests depended on this.
          final double defval;
          if (isWeights)
            defval = 1; // note: even though computeMetrics is false we should still have sensible weights (GLM skips rows with NA weights)
          else if (isFold)
            defval = 0;
          else {
            defval = parms.missingColumnsType();
            convNaN++; // track how many columns are pure filler; used for the no-columns-in-common check below
          }
          String str = "Test/Validation dataset is missing column '" + names[i] + "': substituting in a column of " + defval;
          vec = test.anyVec().makeCon(defval);
          toDelete.put(vec._key, "adapted missing vectors");
          msgs.add(str);
        }
      }
      if (vec != null) { // I have a column with a matching name
        if (domains[i] != null) { // Model expects an categorical
          if (vec.isString())
            vec = VecUtils.stringToCategorical(vec); //turn a String column into a categorical column (we don't delete the original vec here)
          if (expensive && vec.domain() != domains[i] && !Arrays.equals(vec.domain(), domains[i])) { // Result needs to be the same categorical
            Vec evec;
            try {
              evec = vec.adaptTo(domains[i]); // Convert to categorical or throw IAE
              toDelete.put(evec._key, "categorically adapted vec");
            } catch (NumberFormatException nfe) {
              throw new IllegalArgumentException("Test/Validation dataset has a non-categorical column '" + names[i] + "' which is categorical in the training data");
            }
            String[] ds = evec.domain();
            assert ds != null && ds.length >= domains[i].length;
            // All test levels landed past the train levels => zero overlap with the model's domain.
            if (isResponse && vec.domain() != null && ds.length == domains[i].length + vec.domain().length)
              throw new IllegalArgumentException("Test/Validation dataset has a categorical response column '" + names[i] + "' with no levels in common with the model");
            if (ds.length > domains[i].length)
              msgs.add("Test/Validation dataset column '" + names[i] + "' has levels not trained on: " + Arrays.toString(Arrays.copyOfRange(ds, domains[i].length, ds.length)));
            vec = evec;
          }
        } else if (vec.isCategorical()) {
          if (parms._categorical_encoding == Parameters.CategoricalEncodingScheme.LabelEncoder) {
            Vec evec = vec.toNumericVec();
            toDelete.put(evec._key, "label encoded vec");
            vec = evec;
          } else {
            throw new IllegalArgumentException("Test/Validation dataset has categorical column '" + names[i] + "' which is real-valued in the training data");
          }
        }
        good++; // Assumed compatible; not checking e.g. Strings vs UUID
      }
      vvecs[i] = vec;
    }
    if (good == names.length || (response != null && test.find(response) == -1 && good == names.length - 1)) // Only update if got something for all columns
      test.restructure(names, vvecs, good);

    boolean haveCategoricalPredictors = false;
    if (expensive && checkCategoricals && !catEncoded) {
      for (int i = 0; i < test.numCols(); ++i) {
        if (test.names()[i].equals(response)) continue;
        if (test.names()[i].equals(weights)) continue;
        if (test.names()[i].equals(offset)) continue;
        if (test.names()[i].equals(fold)) continue;
        // either the column of the test set is categorical (could be a numeric col that's already turned into a factor)
        if (test.vec(i).cardinality() > 0) {
          haveCategoricalPredictors = true;
          break;
        }
        // or a equally named column of the training set is categorical, but the test column isn't (e.g., numeric column provided to be converted to a factor)
        int whichCol = ArrayUtils.find(names, test.name(i));
        if (whichCol >= 0 && domains[whichCol] != null) {
          haveCategoricalPredictors = true;
          break;
        }
      }
    }
    // check if we first need to expand categoricals before calling this method again
    if (expensive && !catEncoded && haveCategoricalPredictors) {
      Frame updated = categoricalEncoder(test, new String[] { weights, offset, fold, response }, parms._categorical_encoding, tev, parms._max_categorical_levels);
      toDelete.put(updated._key, "categorically encoded frame");
      test.restructure(updated.names(), updated.vecs()); //updated in place
      // Recurse once with the encoded frame; backupNames/backupDomains are the post-encoding train layout.
      String[] msg2 = adaptTestForTrain(test, origNames, origDomains, backupNames, backupDomains, parms, expensive, computeMetrics, interactions, tev, toDelete, true /*catEncoded*/);
      msgs.addAll(Arrays.asList(msg2));
      return msgs.toArray(new String[msgs.size()]);
    }
    if (good == convNaN)
      throw new IllegalArgumentException("Test/Validation dataset has no columns in common with the training set");
    return msgs.toArray(new String[msgs.size()]);
  }

  /**
   * Bulk score the frame, and auto-name the resulting predictions frame.
   * @see #score(Frame, String)
   * @param fr frame which should be scored
   * @return A new frame containing a predicted values. For classification it
   *         contains a column with prediction and distribution for all
   *         response classes. For regression it contains only one column with
   *         predicted values.
   * @throws IllegalArgumentException
   */
  public Frame score(Frame fr) throws IllegalArgumentException {
    return score(fr, null, null, true);
  }

  /** Bulk score the frame {@code fr}, producing a Frame result; the 1st
   *  Vec is the predicted class, the remaining Vecs are the probability
   *  distributions. For Regression (single-class) models, the 1st and only
   *  Vec is the prediction value. The result is in the DKV; caller is
   *  responsible for deleting.
   *
   * @param fr frame which should be scored
   * @param destination_key store prediction frame under give key
   * @param customMetricFunc function to produce adhoc scoring metrics if actuals are presented
   * @return A new frame containing a predicted values. For classification it
   *         contains a column with prediction and distribution for all
   *         response classes. For regression it contains only one column with
   *         predicted values.
 * @throws IllegalArgumentException
 */
  public Frame score(Frame fr, String destination_key, CFuncRef customMetricFunc) throws IllegalArgumentException {
    return score(fr, destination_key, null, true, customMetricFunc);
  }

  /** Bulk score {@code fr} into {@code destination_key}; no Job, metrics computed when possible. */
  public Frame score(Frame fr, String destination_key) throws IllegalArgumentException {
    return score(fr, destination_key, null, true);
  }

  /** Bulk score {@code fr} into {@code destination_key}, reporting progress/cancellation via {@code j}. */
  public Frame score(Frame fr, String destination_key, Job j) throws IllegalArgumentException {
    return score(fr, destination_key, j, true);
  }

  /** Append one prediction-time warning to {@link #_warningsP} (grow-by-one copy). */
  public void addWarningP(String s) {
    _warningsP = Arrays.copyOf(_warningsP, _warningsP.length + 1);
    _warningsP[_warningsP.length - 1] = s;
  }

  /**
   * True when the first single-quoted token in warning message {@code s} equals
   * {@code responseName} — used to suppress "missing response column" warnings at
   * prediction time.
   */
  public boolean containsResponse(String s, String responseName) {
    Pattern pat = Pattern.compile("'(.*?)'");
    Matcher match = pat.matcher(s);
    if (match.find() && responseName.equals(match.group(1))) {
      return true;
    }
    return false;
  }

  /** Bulk score with the default (NOP) custom metric function. */
  public Frame score(Frame fr, String destination_key, Job j, boolean computeMetrics) throws IllegalArgumentException {
    return score(fr, destination_key, j, computeMetrics, CFuncRef.NOP);
  }

  /**
   * Full scoring entry point: adapts a copy of {@code fr} to the training layout,
   * predicts (and optionally computes metrics), remaps the prediction domain to the
   * test set's domain for classifiers, and cleans up the temporary adapted frame.
   */
  public Frame score(Frame fr, String destination_key, Job j, boolean computeMetrics, CFuncRef customMetricFunc) throws IllegalArgumentException {
    Frame adaptFr = new Frame(fr);
    // Metrics require a usable response column for supervised models.
    computeMetrics = computeMetrics && (!isSupervised() || (adaptFr.vec(_output.responseName()) != null && !adaptFr.vec(_output.responseName()).isBad()));
    String[] msg = adaptTestForTrain(adaptFr, true, computeMetrics); // Adapt
    // clean up the previous score warning messages
    _warningsP = new String[0];
    if (msg.length > 0) {
      for (String s : msg) {
        if ((_output.responseName() == null) || !containsResponse(s, _output.responseName())) { // response column missing will not generate warning for prediction
          addWarningP(s); // add warning string to model
          Log.warn(s);
        }
      }
    }
    Frame output = predictScoreImpl(fr, adaptFr, destination_key, j, computeMetrics, customMetricFunc); // Predict & Score
    // Log modest confusion matrices
    Vec predicted = output.vecs()[0]; // Modeled/predicted response
    String mdomain[] = predicted.domain(); // Domain of predictions (union of test and train)
    // Output is in the model's domain, but needs to be mapped to the scored
    // dataset's domain.
    if (_output.isClassifier() && computeMetrics) {
      /*
      if (false) {
        assert(mdomain != null); // label must be categorical
        ModelMetrics mm = ModelMetrics.getFromDKV(this,fr);
        ConfusionMatrix cm = mm.cm();
        if (cm != null && cm._domain != null) //don't print table for regression
          if( cm._cm.length < _parms._max_confusion_matrix_size ) { // Print size limitation
            Log.info(cm.table().toString(1));
          }
        if (mm.hr() != null) {
          Log.info(getHitRatioTable(mm.hr()));
        }
      }
      */
      Vec actual = fr.vec(_output.responseName());
      if (actual != null) { // Predict does not have an actual, scoring does
        String sdomain[] = actual.domain(); // Scored/test domain; can be null
        if (sdomain != null && mdomain != sdomain && !Arrays.equals(mdomain, sdomain))
          output.replace(0, new CategoricalWrappedVec(actual.group().addVec(), actual._rowLayout, sdomain, predicted._key));
      }
    }
    // Delete the adapted copy but keep any Vecs it shares with the caller's frame.
    Frame.deleteTempFrameAndItsNonSharedVecs(adaptFr, fr);
    return output;
  }

  /**
   * Compute the deviances for each observation
   * @param valid Validation Frame (must contain the response)
   * @param predictions Predictions made by the model
   * @param outputName Name of the output frame
   * @return Frame containing 1 column with the per-row deviances
   */
  public Frame computeDeviances(Frame valid, Frame predictions, String outputName) {
    assert (_parms._response_column != null) : "response column can't be null";
    assert valid.find(_parms._response_column) >= 0 : "validation frame must contain a response column";
    // Bolt response (and weights, when present) onto the predictions frame so one MRTask sees everything.
    predictions.add(_parms._response_column, valid.vec(_parms._response_column));
    if (valid.find(_parms._weights_column) >= 0)
      predictions.add(_parms._weights_column, valid.vec(_parms._weights_column));
    final int respIdx = predictions.find(_parms._response_column);
    final int weightIdx = predictions.find(_parms._weights_column);
    // Deep-copy the distribution so the per-dataset Huber delta below doesn't mutate the model's own.
    final Distribution myDist = _dist == null ? null : IcedUtils.deepCopy(_dist);
    if (myDist != null && myDist.distribution == DistributionFamily.huber) {
      myDist.setHuberDelta(hex.ModelMetricsRegression.computeHuberDelta(valid.vec(_parms._response_column), //actual
          predictions.vec(0), //predictions
          valid.vec(_parms._weights_column), //weight
          _parms._huber_alpha));
    }
    return new MRTask() {
      @Override
      public void map(Chunk[] cs, NewChunk[] nc) {
        Chunk weight = weightIdx >= 0 ? cs[weightIdx] : new C0DChunk(1, cs[0]._len);
        Chunk response = cs[respIdx];
        for (int i = 0; i < cs[0]._len; ++i) {
          double w = weight.atd(i);
          double y = response.atd(i);
          if (_output.nclasses() == 1) { //regression - deviance
            double f = cs[0].atd(i);
            if (myDist != null && myDist.distribution == DistributionFamily.huber) {
              nc[0].addNum(myDist.deviance(w, y, f)); //use above custom huber delta for this dataset
            } else {
              nc[0].addNum(deviance(w, y, f));
            }
          } else {
            // classification: weighted logloss of the probability assigned to the actual class
            int iact = (int) y;
            double err = iact < _output.nclasses() ? 1 - cs[1 + iact].atd(i) : 1;
            nc[0].addNum(w * MathUtils.logloss(err));
          }
        }
      }
    }.doAll(Vec.T_NUM, predictions).outputFrame(Key.<Frame>make(outputName), new String[] { "deviance" }, null);
  }

  /** Column names of the prediction frame for this model's output. */
  protected String[] makeScoringNames() {
    return makeScoringNames(_output);
  }

  /**
   * Prediction column names: "predict" plus, for classifiers, one per-class
   * probability column (integer class labels get a "p" prefix: 0 -> p0).
   */
  public static <O extends Model.Output> String[] makeScoringNames(O output) {
    final int nc = output.nclasses();
    final int ncols = nc == 1 ? 1 : nc + 1; // Regression has 1 predict col; classification also has class distribution
    String[] names = new String[ncols];
    names[0] = "predict";
    for (int i = 1; i < names.length; ++i) {
      names[i] = output.classNames()[i - 1];
      // turn integer class labels such as 0, 1, etc. into p0, p1, etc.
      try {
        Integer.valueOf(names[i]);
        names[i] = "p" + names[i];
      } catch (Throwable t) {
        // do nothing, non-integer names are fine already
      }
    }
    return names;
  }

  /** Allow subclasses to define their own BigScore class.
 */
  protected BigScore makeBigScoreTask(String[][] domains, String[] names, Frame adaptFrm, boolean computeMetrics,
      boolean makePrediction, Job j, CFuncRef customMetricFunc) {
    return new BigScore(domains[0], names != null ? names.length : 0, adaptFrm.means(),
        _output.hasWeights() && adaptFrm.find(_output.weightsName()) >= 0,
        computeMetrics, makePrediction, j, customMetricFunc);
  }

  /** Score an already adapted frame.  Returns a new Frame with new result
   *  vectors, all in the DKV.  Caller responsible for deleting.  Input is
   *  already adapted to the Model's domain, so the output is also.  Also
   *  computes the metrics for this frame.
   *
   * @param adaptFrm Already adapted frame
   * @param computeMetrics whether model metrics should be computed while scoring
   * @return A Frame containing the prediction column, and class distribution
   */
  protected Frame predictScoreImpl(Frame fr, Frame adaptFrm, String destination_key, Job j, boolean computeMetrics, CFuncRef customMetricFunc) {
    // Build up the names & domains.
    String[] names = makeScoringNames();
    String[][] domains = new String[names.length][];
    domains[0] = names.length == 1 ? null : !computeMetrics ? _output._domains[_output._domains.length - 1] : adaptFrm.lastVec().domain();
    if (_parms._distribution == DistributionFamily.quasibinomial) {
      domains[0] = new String[] { "0", "1" };
    }
    // Score the dataset, building the class distribution & predictions
    BigScore bs = makeBigScoreTask(domains, names, adaptFrm, computeMetrics, true, j, customMetricFunc)
        .doAll(names.length, Vec.T_NUM, adaptFrm);
    if (computeMetrics)
      bs._mb.makeModelMetrics(this, fr, adaptFrm, bs.outputFrame());
    Frame predictFr = bs.outputFrame(Key.<Frame>make(destination_key), names, domains);
    return postProcessPredictions(adaptFrm, predictFr, j);
  }

  /**
   * Post-process prediction frame.  Default is a no-op hook for subclasses.
   *
   * @param adaptFrm the adapted input frame
   * @param predictFr the raw prediction frame
   * @return the (possibly modified) prediction frame
   */
  protected Frame postProcessPredictions(Frame adaptFrm, Frame predictFr, Job j) {
    return predictFr;
  }

  /** Score an already adapted frame.  Returns a MetricBuilder that can be used to make a model metrics.
   * @param adaptFrm Already adapted frame
   * @return MetricBuilder
   */
  protected ModelMetrics.MetricBuilder scoreMetrics(Frame adaptFrm) {
    final boolean computeMetrics = (!isSupervised() || (adaptFrm.vec(_output.responseName()) != null && !adaptFrm.vec(_output.responseName()).isBad()));
    // Build up the names & domains.
    //String[] names = makeScoringNames();
    String[][] domains = new String[1][];
    domains[0] = _output.nclasses() == 1 ? null : !computeMetrics ? _output._domains[_output._domains.length - 1] : adaptFrm.lastVec().domain();
    if (domains[0] == null && _parms._distribution == DistributionFamily.quasibinomial) {
      domains[0] = new String[] { "0", "1" };
    }
    // Score the dataset, building the class distribution & predictions
    BigScore bs = makeBigScoreTask(domains, null, adaptFrm, computeMetrics, false, null, CFuncRef.from(_parms._custom_metric_func)).doAll(adaptFrm);
    return bs._mb;
  }

  /** MRTask doing the per-chunk scoring: builds predictions and/or a metrics builder. */
  protected class BigScore extends CMetricScoringTask<BigScore> {
    final protected String[] _domain; // Prediction domain; union of test and train classes
    final protected int _npredcols; // Number of columns in prediction; nclasses+1 - can be less than the prediction domain
    final double[] _mean; // Column means of test frame
    final public boolean _computeMetrics; // Whether metrics should be computed while scoring
    final public boolean _hasWeights; // Whether the test frame carries a weights column
    final public boolean _makePreds; // Whether prediction columns should be emitted
    final public Job _j; // Optional job, polled for cancellation
    /** Output parameter: Metric builder */
    public ModelMetrics.MetricBuilder _mb;

    public BigScore(String[] domain, int ncols, double[] mean, boolean testHasWeights, boolean computeMetrics, boolean makePreds, Job j, CFuncRef customMetricFunc) {
      super(customMetricFunc);
      _j = j;
      _domain = domain;
      _npredcols = ncols;
      _mean = mean;
      _computeMetrics = computeMetrics;
      _makePreds = makePreds;
      if (_output._hasWeights && _computeMetrics && !testHasWeights)
        throw new IllegalArgumentException("Missing weights when computing validation metrics.");
      _hasWeights = testHasWeights;
    }

    @Override
    public void map(Chunk chks[], NewChunk cpreds[]) {
      if (isCancelled() || _j != null && _j.stop_requested()) return;
      Chunk weightsChunk = _hasWeights && _computeMetrics ? chks[_output.weightsIdx()] : null;
      Chunk offsetChunk = _output.hasOffset() ? chks[_output.offsetIdx()] : null;
      Chunk responseChunk = null;
      float[] actual = null;
      _mb = Model.this.makeMetricBuilder(_domain);
      if (_computeMetrics) {
        if (isSupervised()) {
          actual = new float[1];
          responseChunk = chks[_output.responseIdx()];
        } else
          actual = new float[chks.length];
      }
      int len = chks[0]._len;
      try {
        setupBigScorePredict();
        if (!bulkBigScorePredict()) {
          // Row-at-a-time scoring path.
          double[] tmp = new double[_output.nfeatures()];
          double[] preds = _mb._work; // Sized for the union of test and train classes
          for (int row = 0; row < len; row++) {
            double weight = weightsChunk != null ? weightsChunk.atd(row) : 1;
            if (weight == 0) {
              // Zero-weight rows get all-zero predictions and are excluded from metrics.
              if (_makePreds) {
                for (int c = 0; c < _npredcols; c++) // Output predictions; sized for train only (excludes extra test classes)
                  cpreds[c].addNum(0);
              }
              continue;
            }
            double offset = offsetChunk != null ? offsetChunk.atd(row) : 0;
            double[] p = score0(chks, offset, row, tmp, preds);
            if (_computeMetrics) {
              if (isSupervised()) {
                actual[0] = (float) responseChunk.atd(row);
              } else {
                for (int i = 0; i < actual.length; ++i)
                  actual[i] = (float) data(chks, row, i);
              }
              _mb.perRow(preds, actual, weight, offset, Model.this);
              // Handle custom metric
              customMetricPerRow(preds, actual, weight, offset, Model.this);
            }
            if (_makePreds) {
              for (int c = 0; c < _npredcols; c++) // Output predictions; sized for train only (excludes extra test classes)
                cpreds[c].addNum(p[c]);
            }
          }
        } else {
          // Bulk scoring path: collect non-zero-weight row indices, score them all at once.
          int[] indices = new int[len];
          double[] offsets = offsetChunk != null ? new double[len] : null;
          int nonZeroW = 0;
          for (int row = 0; row < len; row++) {
            double weight = getWeight(weightsChunk, row);
            if (weight == 0) {
              if (_makePreds) {
                for (int c = 0; c < _npredcols; c++) // Output predictions; sized for train only (excludes extra test classes)
                  cpreds[c].addNum(0);
              }
              continue;
            }
            if (offsetChunk != null) {
              offsets[nonZeroW] = getOffset(offsetChunk, row);
            }
            indices[nonZeroW++] = row;
          }
          indices = Arrays.copyOf(indices, nonZeroW);
          if (0 == nonZeroW) {
            return;
          }
          double[][] bulkPreds = new double[nonZeroW][];
          for (int i = 0; i < bulkPreds.length; i++) {
            bulkPreds[i] = new double[_mb._work.length];
          }
          double[][] bulkTmp = new double[nonZeroW][];
          for (int i = 0; i < bulkTmp.length; i++) {
            bulkTmp[i] = new double[_output.nfeatures()];
          }
          double[][] p = score0(chks, offsets, indices, bulkTmp, bulkPreds);
          for (int rowIdx = 0; rowIdx < indices.length; rowIdx++) {
            int row = indices[rowIdx];
            if (_computeMetrics) {
              if (isSupervised()) {
                actual[0] = (float) responseChunk.atd(row);
              } else {
                for (int i = 0; i < actual.length; ++i)
                  actual[i] = (float) data(chks, row, i);
              }
              _mb.perRow(bulkPreds[rowIdx], actual, getWeight(weightsChunk, row), getOffset(offsetChunk, row), Model.this);
            }
            if (_makePreds) {
              for (int c = 0; c < _npredcols; c++) // Output predictions; sized for train only (excludes extra test classes)
                cpreds[c].addNum(p[rowIdx][c]);
            }
          }
        }
      } finally {
        closeBigScorePredict();
      }
    }

    @Override
    public void reduce(BigScore bs) {
      super.reduce(bs);
      if (_mb != null) _mb.reduce(bs._mb);
    }

    @Override
    protected void postGlobal() {
      super.postGlobal();
      if (_mb != null) {
        _mb.postGlobal(getComputedCustomMetric());
      }
    }

    // Offset for a row; 0 when there is no offset column.
    private double getOffset(Chunk offsetChunk, int row) {
      return offsetChunk != null ? offsetChunk.atd(row) : 0;
    }

    // Weight for a row; 1 when there is no weights column.
    private double getWeight(Chunk weightsChunk, int row) {
      return weightsChunk != null ? weightsChunk.atd(row) : 1;
    }
  }

  /** Hook: return true to use the bulk (multi-row) score0 path in BigScore. */
  protected boolean bulkBigScorePredict() {
    return false;
  }

  /** Hook: called once per chunk before scoring starts. */
  protected void setupBigScorePredict() {
  }

  /** Hook: called once per chunk after scoring (even on failure). */
  protected void closeBigScorePredict() {
  }

  // Override this if your model needs data preprocessing (on the fly standardization, NA handling)
  protected double data(Chunk[] chks, int row, int col) {
    return chks[col].atd(row);
  }

  /** Bulk scoring API for one row.  Chunks are all compatible with the model,
   *  and expect the last Chunks are for the final distribution and prediction.
   *  Default method is to just load the data into the tmp array, then call
   *  subclass scoring logic. */
  public double[] score0(Chunk chks[], int row_in_chunk, double[] tmp, double[] preds) {
    return score0(chks, 0, row_in_chunk, tmp, preds);
  }

  // To be implemented by Models that override bulkBigScorePredict() to return true
  public double[][] score0(Chunk chks[], double[] offset, int[] rowsInChunk, double[][] tmp, double[][] preds) {
    throw new IllegalStateException("Not implemented.");
  }

  /** Score one row: load features into {@code tmp}, delegate to the subclass, then post-process for supervised models. */
  public double[] score0(Chunk chks[], double offset, int row_in_chunk, double[] tmp, double[] preds) {
    assert (_output.nfeatures() == tmp.length);
    for (int i = 0; i < tmp.length; i++)
      tmp[i] = chks[i].atd(row_in_chunk);
    double[] scored = score0(tmp, preds, offset);
    if (isSupervised())
      score0PostProcessSupervised(scored, tmp);
    return scored;
  }

  protected final void score0PostProcessSupervised(double[] scored, double[] tmp) {
    // Correct probabilities obtained from training on oversampled data back to original distribution
    // C.f. http://gking.harvard.edu/files/0s.pdf Eq.(27)
    if (_output.isClassifier()) {
      if (_parms._balance_classes)
        GenModel.correctProbabilities(scored, _output._priorClassDist, _output._modelClassDist);
      //assign label at the very end (after potentially correcting probabilities)
      scored[0] = hex.genmodel.GenModel.getPrediction(scored, _output._priorClassDist, tmp, defaultThreshold());
    }
  }

  /** Subclasses implement the scoring logic.
The data is pre-loaded into a
 *  re-used temp array, in the order the model expects.  The predictions are
 *  loaded into the re-used temp array, which is also returned. */
  protected abstract double[] score0(double data[/*ncols*/], double preds[/*nclasses+1*/]);

  /** Override scoring logic for models that handle weight/offset. */
  protected double[] score0(double data[/*ncols*/], double preds[/*nclasses+1*/], double offset) {
    assert (offset == 0) : "Override this method for non-trivial offset!";
    return score0(data, preds);
  }

  // Version where the user has just ponied-up an array of data to be scored.
  // Data must be in proper order.  Handy for JUnit tests.
  public double score(double[] data) {
    return ArrayUtils.maxIndex(score0(data, new double[_output.nclasses()]));
  }

  /** Remove this model and its stored model metrics and temporary artifacts from the DKV. */
  @Override
  protected Futures remove_impl(Futures fs) {
    if (_output._model_metrics != null)
      for (Key k : _output._model_metrics)
        k.remove(fs);
    cleanUp(_toDelete);
    return super.remove_impl(fs);
  }

  /** Write out K/V pairs, in this case model metrics. */
  @Override
  protected AutoBuffer writeAll_impl(AutoBuffer ab) {
    if (_output._model_metrics != null)
      for (Key k : _output._model_metrics)
        ab.putKey(k);
    return super.writeAll_impl(ab);
  }

  @Override
  protected Keyed readAll_impl(AutoBuffer ab, Futures fs) {
    if (_output._model_metrics != null)
      for (Key k : _output._model_metrics)
        ab.getKey(k, fs); // Load model metrics
    return super.readAll_impl(ab, fs);
  }

  /** Checksum combining parameter and output checksums. */
  @Override
  protected long checksum_impl() {
    return _parms.checksum_impl() * _output.checksum_impl();
  }

  /**
   * Override this in models that support serialization into the MOJO format.
   * @return a class that inherits from ModelMojoWriter
   */
  public ModelMojoWriter getMojo() {
    throw H2O.unimpl("MOJO format is not available for " + _parms.fullName() + " models.");
  }

  // ==========================================================================
  /** Return a String which is a valid Java program representing a class that
   *  implements the Model.
The Java is of the form:
   * <pre>
   *    class UUIDxxxxModel {
   *      public static final String NAMES[] = { ....column names... }
   *      public static final String DOMAINS[][] = { ....domain names... }
   *      // Pass in data in a double[], pre-aligned to the Model's requirements.
   *      // Jam predictions into the preds[] array; preds[0] is reserved for the
   *      // main prediction (class for classifiers or value for regression),
   *      // and remaining columns hold a probability distribution for classifiers.
   *      double[] predict( double data[], double preds[] );
   *      double[] map( HashMap &lt; String,Double &gt; row, double data[] );
   *      // Does the mapping lookup for every row, no allocation
   *      double[] predict( HashMap &lt; String,Double &gt; row, double data[], double preds[] );
   *      // Allocates a double[] for every row
   *      double[] predict( HashMap &lt; String,Double &gt; row, double preds[] );
   *      // Allocates a double[] and a double[] for every row
   *      double[] predict( HashMap &lt; String,Double &gt; row );
   *    }
   * </pre>
   */
  public final String toJava(boolean preview, boolean verboseCode) {
    // 32k buffer by default
    ByteArrayOutputStream os = new ByteArrayOutputStream(Short.MAX_VALUE);
    // We do not need to close BAOS
    /* ignore returned stream */ toJava(os, preview, verboseCode);
    return os.toString();
  }

  /** Stream variant: when previewing, the output is truncated after 1000 lines
   *  so the browser is not flooded with a huge generated class. */
  public final SBPrintStream toJava(OutputStream os, boolean preview, boolean verboseCode) {
    if (preview /* && toJavaCheckTooBig() */) {
      os = new LineLimitOutputStreamWrapper(os, 1000);
    }
    return toJava(new SBPrintStream(os), preview, verboseCode);
  }

  /** Core POJO emitter: writes the license/usage header, the class declaration,
   *  and delegates the individual sections (NAMES, NCLASSES, DOMAINS, predict
   *  body, ...) to the overridable toJavaXXX hooks. Nested classes collected in
   *  {@code fileCtx} are appended after the main class body. */
  protected SBPrintStream toJava(SBPrintStream sb, boolean isGeneratingPreview, boolean verboseCode) {
    CodeGeneratorPipeline fileCtx = new CodeGeneratorPipeline(); // preserve file context
    String modelName = JCodeGen.toJavaId(_key.toString());
    // HEADER
    sb.p("/*").nl();
    sb.p(" Licensed under the Apache License, Version 2.0").nl();
    sb.p(" http://www.apache.org/licenses/LICENSE-2.0.html").nl();
    sb.nl();
    sb.p(" AUTOGENERATED BY H2O at ").p(new DateTime().toString()).nl();
    sb.p(" ").p(H2O.ABV.projectVersion()).nl();
    sb.p(" ").nl();
    sb.p(" Standalone prediction code with sample test data for ").p(this.getClass().getSimpleName())
        .p(" named ").p(modelName).nl();
    sb.nl();
    sb.p(" How to download, compile and execute:").nl();
    sb.p(" mkdir tmpdir").nl();
    sb.p(" cd tmpdir").nl();
    // Note: H2O.SELF.toString() starts with a leading '/', so "http:/" + it
    // yields a well-formed "http://host:port" URL.
    sb.p(" curl http:/").p(H2O.SELF.toString()).p("/3/h2o-genmodel.jar > h2o-genmodel.jar").nl();
    sb.p(" curl http:/").p(H2O.SELF.toString()).p("/3/Models.java/").pobj(_key).p(" > ").p(modelName)
        .p(".java").nl();
    sb.p(" javac -cp h2o-genmodel.jar -J-Xmx2g -J-XX:MaxPermSize=128m ").p(modelName).p(".java").nl();
    // Intentionally disabled since there is no main method in generated code
    // sb.p("// java -cp h2o-genmodel.jar:. -Xmx2g -XX:MaxPermSize=256m -XX:ReservedCodeCacheSize=256m ").p(modelName).nl();
    sb.nl();
    sb.p(" (Note: Try java argument -XX:+PrintCompilation to show runtime JIT compiler behavior.)").nl();
    if (_parms._offset_column != null) {
      sb.nl();
      sb.nl();
      sb.nl();
      sb.p(" NOTE: Java model export does not support offset_column.").nl();
      sb.nl();
      Log.warn("Java model export does not support offset_column.");
    }
    // Oversized models render only the header comment in preview mode.
    if (isGeneratingPreview && toJavaCheckTooBig()) {
      sb.nl();
      sb.nl();
      sb.nl();
      sb.p(" NOTE: Java model is too large to preview, please download as shown above.").nl();
      sb.nl();
      return sb;
    }
    sb.p("*/").nl();
    sb.p("import java.util.Map;").nl();
    sb.p("import hex.genmodel.GenModel;").nl();
    sb.p("import hex.genmodel.annotations.ModelPojo;").nl();
    for (Class<?> clz : getPojoInterfaces())
      sb.p("import ").p(clz.getName()).p(";").nl();
    sb.nl();
    String algo = this.getClass().getSimpleName().toLowerCase().replace("model", "");
    sb.p("@ModelPojo(name=\"").p(modelName).p("\", algorithm=\"").p(algo).p("\")").nl();
    sb.p("public class ").p(modelName).p(" extends GenModel ").p(makeImplementsClause()).p("{").nl().ii(1);
    sb.ip("public hex.ModelCategory getModelCategory() { return hex.ModelCategory."
        + _output.getModelCategory() + "; }").nl();
    toJavaInit(sb, fileCtx).nl();
    toJavaNAMES(sb, fileCtx);
    toJavaNCLASSES(sb);
    toJavaDOMAINS(sb, fileCtx);
    toJavaPROB(sb);
    toJavaSuper(modelName, sb);
    // sb.p(" public String getUUID() { return Long.toString(" + checksum() + "L); }").nl();
    toJavaPredict(sb, fileCtx, verboseCode);
    toJavaTransform(sb, fileCtx, verboseCode);
    sb.p("}").nl().di(1);
    fileCtx.generate(sb); // Append file context
    sb.nl();
    return sb;
  }

  /** Builds the "implements A, B " clause for the generated class, or an empty
   *  SB when the model declares no POJO interfaces. */
  private SB makeImplementsClause() {
    SB sb = new SB();
    Class<?>[] interfaces = getPojoInterfaces();
    if (interfaces.length == 0) return sb;
    sb.p("implements ");
    for (int i = 0; i < interfaces.length - 1; i++)
      sb.p(interfaces[i].getSimpleName()).p(", ");
    sb.p(interfaces[interfaces.length - 1].getSimpleName()).p(' ');
    return sb;
  }

  // Interfaces the generated POJO should implement; none by default.
  protected Class<?>[] getPojoInterfaces() { return new Class<?>[0]; }

  /** Generate implementation for super class. */
  protected SBPrintStream toJavaSuper(String modelName, SBPrintStream sb) {
    // Quote the response name for supervised models; unsupervised models pass null.
    String responseName = isSupervised() ? '"' + _output.responseName() + '"' : null;
    return sb.nl().ip("public " + modelName + "() { super(NAMES,DOMAINS," + responseName + "); }").nl();
  }

  /** Emits the NAMES reference plus a deferred helper class that holds the
   *  training column names (only the first nfeatures() columns, i.e. without
   *  the response). */
  private SBPrintStream toJavaNAMES(SBPrintStream sb, CodeGeneratorPipeline fileCtx) {
    final String modelName = JCodeGen.toJavaId(_key.toString());
    final String namesHolderClassName = "NamesHolder_" + modelName;
    sb.i().p("// ").p("Names of columns used by model.").nl();
    sb.i().p("public static final String[] NAMES = " + namesHolderClassName + ".VALUES;").nl();
    // Generate class which fills the names into array
    fileCtx.add(new CodeGenerator() {
      @Override
      public void generate(JCodeSB out) {
        out.i().p("// The class representing training column names").nl();
        JCodeGen.toClassWithArray(out, null, namesHolderClassName,
            Arrays.copyOf(_output._names, _output.nfeatures()));
      }
    });
    return sb;
  }

  // Emits the NCLASSES constant for classifiers only.
  protected SBPrintStream toJavaNCLASSES(SBPrintStream sb) {
    return _output.isClassifier() ? JCodeGen.toStaticVar(sb, "NCLASSES", _output.nclasses(),
        "Number of output classes included in training data response column.")
        : sb;
  }

  /** Emits the DOMAINS table: one entry per column, null for numeric columns,
   *  a reference to a deferred per-column holder class for categoricals. */
  private SBPrintStream toJavaDOMAINS(SBPrintStream sb, CodeGeneratorPipeline fileCtx) {
    String modelName = JCodeGen.toJavaId(_key.toString());
    sb.nl();
    sb.ip("// Column domains. The last array contains domain of response column.").nl();
    sb.ip("public static final String[][] DOMAINS = new String[][] {").nl();
    String[][] domains = scoringDomains();
    for (int i = 0; i < domains.length; i++) {
      final int idx = i;
      final String[] dom = domains[i];
      final String colInfoClazz = modelName + "_ColInfo_" + i;
      sb.i(1).p("/* ").p(_output._names[i]).p(" */ ");
      if (dom != null) sb.p(colInfoClazz).p(".VALUES");
      else sb.p("null");
      if (i != domains.length - 1) sb.p(',');
      sb.nl();
      // Right now do not generate the class representing column
      // since it does not hold any interesting information except String array holding domain
      if (dom != null) {
        fileCtx.add(new CodeGenerator() {
          @Override
          public void generate(JCodeSB out) {
            out.ip("// The class representing column ").p(_output._names[idx]).nl();
            JCodeGen.toClassWithArray(out, null, colInfoClazz, dom);
          }
        });
      }
    }
    return sb.ip("};").nl();
  }

  // Emits prior/model class distributions for supervised models.
  protected SBPrintStream toJavaPROB(SBPrintStream sb) {
    if (isSupervised()) {
      JCodeGen.toStaticVar(sb, "PRIOR_CLASS_DISTRIB", _output._priorClassDist, "Prior class distribution");
      JCodeGen.toStaticVar(sb, "MODEL_CLASS_DISTRIB", _output._modelClassDist, "Class distribution used for model building");
    }
    return sb;
  }

  // Conservative default: claim "too big" so unprepared model types never try
  // to render a full POJO preview in the browser.
  protected boolean toJavaCheckTooBig() {
    Log.warn("toJavaCheckTooBig must be overridden for this model type to render it in the browser");
    return true;
  }

  // Override in subclasses to provide some top-level model-specific goodness
  protected SBPrintStream toJavaInit(SBPrintStream sb, CodeGeneratorPipeline fileContext) { return sb; }

  // Override in subclasses to provide some inside 'predict' call goodness
  // Method returns code which should be appended into generated top level class
  // after the predict method.
  protected void toJavaPredictBody(SBPrintStream body,
                                   CodeGeneratorPipeline classCtx,
                                   CodeGeneratorPipeline fileCtx,
                                   boolean verboseCode) {
    throw new UnsupportedOperationException("This model type does not support conversion to Java");
  }

  // Wrapper around the main predict call, including the signature and return value
  private SBPrintStream toJavaPredict(SBPrintStream ccsb,
                                      CodeGeneratorPipeline fileCtx,
                                      boolean verboseCode) { // ccsb = classContext
    ccsb.nl();
    ccsb.ip("// Pass in data in a double[], pre-aligned to the Model's requirements.").nl();
    ccsb.ip("// Jam predictions into the preds[] array; preds[0] is reserved for the").nl();
    ccsb.ip("// main prediction (class for classifiers or value for regression),").nl();
    ccsb.ip("// and remaining columns hold a probability distribution for classifiers.").nl();
    ccsb.ip("public final double[] score0( double[] data, double[] preds ) {").nl();
    CodeGeneratorPipeline classCtx = new CodeGeneratorPipeline(); //new SB().ii(1);
    toJavaPredictBody(ccsb.ii(1), classCtx, fileCtx, verboseCode);
    ccsb.ip("return preds;").nl();
    ccsb.di(1).ip("}").nl();
    // Output class context
    classCtx.generate(ccsb.ii(1));
    ccsb.di(1);
    return ccsb;
  }

  // Generates optional "transform" method, transform method will have a different signature depending on the algo
  // Empty by default - can be overriden by Model implementation
  protected SBPrintStream toJavaTransform(SBPrintStream ccsb,
                                          CodeGeneratorPipeline fileCtx,
                                          boolean verboseCode) { // ccsb = classContext
    return ccsb;
  }

  // Convenience method for testing: build Java, convert it to a class &
  // execute it: compare the results of the new class's (JIT'd) scoring with
  // the built-in (interpreted) scoring on this dataset. Returns true if all
  // is well, false if there are any mismatches. Throws if there is any error
  // (typically an AssertionError or unable to compile the POJO).
public boolean testJavaScoring(Frame data, Frame model_predictions, double rel_epsilon) {
    return testJavaScoring(data, model_predictions, rel_epsilon, 1e-15, 0.1);
  }

  public boolean testJavaScoring(Frame data, Frame model_predictions, double rel_epsilon, double abs_epsilon) {
    return testJavaScoring(data, model_predictions, rel_epsilon, abs_epsilon, 0.1);
  }

  public boolean testJavaScoring(Frame data, Frame model_predictions, double rel_epsilon, double abs_epsilon, double fraction) {
    return testJavaScoring(data, model_predictions, new EasyPredictModelWrapper.Config(), rel_epsilon, abs_epsilon, fraction);
  }

  /**
   * Compares the model's internal predictions against the POJO (fast double[] API
   * and EasyPredict) and the MOJO (EasyPredict), on a random {@code fraction} of rows.
   * @return true when no mismatches were found (or when POJO compilation was
   *         legitimately refused with an IllegalArgumentException), false otherwise.
   */
  public boolean testJavaScoring(Frame data, Frame model_predictions, EasyPredictModelWrapper.Config config,
                                 double rel_epsilon, double abs_epsilon, double fraction) {
    ModelBuilder mb = ModelBuilder.make(_parms.algoName().toLowerCase(), null, null);
    boolean havePojo = mb.havePojo();
    boolean haveMojo = mb.haveMojo();
    // Seed from the frame's byte size so the sampled row subset is reproducible per dataset.
    Random rnd = RandomUtils.getRNG(data.byteSize());
    assert data.numRows() == model_predictions.numRows();
    Frame fr = new Frame(data);
    boolean computeMetrics = data.vec(_output.responseName()) != null && !data.vec(_output.responseName()).isBad();
    try {
      String[] warns = adaptTestForTrain(fr, true, computeMetrics);
      if (warns.length > 0) System.err.println(Arrays.toString(warns));

      // Output is in the model's domain, but needs to be mapped to the scored
      // dataset's domain.
      int[] omap = null;
      if (_output.isClassifier()) {
        Vec actual = fr.vec(_output.responseName());
        String[] sdomain = actual == null ? null : actual.domain(); // Scored/test domain; can be null
        String[] mdomain = model_predictions.vec(0).domain(); // Domain of predictions (union of test and train)
        if (sdomain != null && !Arrays.equals(mdomain, sdomain)) {
          omap = CategoricalWrappedVec.computeMap(mdomain, sdomain); // Map from model-domain to scoring-domain
        }
      }

      String modelName = JCodeGen.toJavaId(_key.toString());
      boolean preview = false;
      GenModel genmodel = null;
      Vec[] dvecs = fr.vecs();
      Vec[] pvecs = model_predictions.vecs();
      double[] features = null;
      int num_errors = 0;
      int num_total = 0;

      // First try internal POJO via fast double[] API
      if (havePojo) {
        try {
          String java_text = toJava(preview, true);
          Class clz = JCodeGen.compile(modelName, java_text);
          genmodel = (GenModel) clz.newInstance();
        } catch (IllegalArgumentException e) {
          // POJO deliberately unsupported for this configuration: not a test failure.
          e.printStackTrace();
          return true;
        } catch (Exception e) {
          e.printStackTrace();
          throw H2O.fail("Internal POJO compilation failed", e);
        }

        // Check that POJO has the expected interfaces
        for (Class<?> clz : getPojoInterfaces())
          if (!clz.isInstance(genmodel))
            throw new IllegalStateException("POJO is expected to implement interface " + clz.getName());

        // Check some model metadata
        assert _output.responseName() == null || _output.responseName().equals(genmodel.getResponseName());

        features = MemoryManager.malloc8d(genmodel.nfeatures());
        double[] predictions = MemoryManager.malloc8d(genmodel.nclasses() + 1);

        // Compare predictions, counting mis-predicts
        for (int row = 0; row < fr.numRows(); row++) { // For all rows, single-threaded
          if (rnd.nextDouble() >= fraction) continue;
          num_total++;

          // Native Java API
          for (int col = 0; col < features.length; col++) // Build feature set
            features[col] = dvecs[col].at(row);
          genmodel.score0(features, predictions); // POJO predictions
          // For classifiers skip col 0 (the label): it is compared via the probability columns.
          for (int col = _output.isClassifier() ? 1 : 0; col < pvecs.length; col++) { // Compare predictions
            double d = pvecs[col].at(row); // Load internal scoring predictions
            if (col == 0 && omap != null) d = omap[(int) d]; // map categorical response to scoring domain
            if (!MathUtils.compare(predictions[col], d, abs_epsilon, rel_epsilon)) {
              // NOTE(review): this section caps printed mismatches at 10, while the
              // EasyPredict section below uses 20 (and the summary message says 20).
              if (num_errors++ < 10)
                System.err.println("Predictions mismatch, row " + row + ", col " + model_predictions._names[col]
                    + ", internal prediction=" + d + ", POJO prediction=" + predictions[col]);
              break;
            }
          }
        }
      }

      // EasyPredict API with POJO and/or MOJO
      // i == 0 reuses the compiled POJO from above; i == 1 round-trips through a MOJO zip.
      for (int i = 0; i < 2; ++i) {
        if (i == 0 && !havePojo) continue;
        if (i == 1 && !haveMojo) continue;
        if (i == 1) { // MOJO
          final String filename = modelName + ".zip";
          StreamingSchema ss = new StreamingSchema(getMojo(), filename);
          try {
            // NOTE(review): the FileOutputStream is not closed in a finally block;
            // if writeTo throws, the descriptor leaks.
            FileOutputStream os = new FileOutputStream(ss.getFilename());
            ss.getStreamWriter().writeTo(os);
            os.close();
            genmodel = MojoModel.load(filename);
            features = MemoryManager.malloc8d(genmodel._names.length);
          } catch (IOException e1) {
            e1.printStackTrace();
            throw H2O.fail("Internal MOJO loading failed", e1);
          } finally {
            boolean deleted = new File(filename).delete();
            if (!deleted) Log.warn("Failed to delete the file");
          }
        }

        EasyPredictModelWrapper epmw = new EasyPredictModelWrapper(
            config.setModel(genmodel).setConvertUnknownCategoricalLevelsToNa(true));
        RowData rowData = new RowData();
        BufferedString bStr = new BufferedString();
        for (int row = 0; row < fr.numRows(); row++) { // For all rows, single-threaded
          if (rnd.nextDouble() >= fraction) continue;

          // Generate input row: categorical ordinals are translated back to their
          // domain strings so EasyPredict sees the same values a user would supply.
          for (int col = 0; col < features.length; col++) {
            if (dvecs[col].isString()) {
              rowData.put(genmodel._names[col], dvecs[col].atStr(bStr, row).toString());
            } else {
              double val = dvecs[col].at(row);
              rowData.put(genmodel._names[col],
                  genmodel._domains[col] == null ? (Double) val
                      : Double.isNaN(val) ? val // missing categorical values are kept as NaN, the score0 logic passes it on to bitSetContains()
                      : (int) val < genmodel._domains[col].length ? genmodel._domains[col][(int) val] : "UnknownLevel"); //unseen levels are treated as such
            }
          }

          // Make a prediction
          AbstractPrediction p;
          try {
            p = epmw.predict(rowData);
          } catch (PredictException e) {
            num_errors++;
            if (num_errors < 20) {
              System.err.println("EasyPredict threw an exception when predicting row " + rowData);
              e.printStackTrace();
            }
            continue;
          }

          // Convert model predictions and "internal" predictions into the same shape
          double[] expected_preds = new double[pvecs.length];
          double[] actual_preds = new double[pvecs.length];
          for (int col = 0; col < pvecs.length; col++) { // Compare predictions
            double d = pvecs[col].at(row); // Load internal scoring predictions
            if (col == 0 && omap != null) d = omap[(int) d]; // map categorical response to scoring domain
            double d2 = Double.NaN;
            switch (genmodel.getModelCategory()) {
              case AutoEncoder:
                d2 = ((AutoEncoderModelPrediction) p).reconstructed[col];
                break;
              case Clustering:
                d2 = ((ClusteringModelPrediction) p).cluster;
                break;
              case Regression:
                d2 = ((RegressionModelPrediction) p).value;
                break;
              case Binomial:
                BinomialModelPrediction bmp = (BinomialModelPrediction) p;
                d2 = (col == 0) ? bmp.labelIndex : bmp.classProbabilities[col - 1];
                break;
              case Multinomial:
                MultinomialModelPrediction mmp = (MultinomialModelPrediction) p;
                d2 = (col == 0) ? mmp.labelIndex : mmp.classProbabilities[col - 1];
                break;
              case DimReduction:
                d2 = ((DimReductionModelPrediction) p).dimensions[col];
                break;
            }
            expected_preds[col] = d;
            actual_preds[col] = d2;
          }

          // Verify the correctness of the prediction
          num_total++;
          for (int col = genmodel.isClassifier() ? 1 : 0; col < pvecs.length; col++) {
            if (!MathUtils.compare(actual_preds[col], expected_preds[col], abs_epsilon, rel_epsilon)) {
              num_errors++;
              if (num_errors < 20) {
                System.err.println((i == 0 ? "POJO" : "MOJO") + " EasyPredict Predictions mismatch for row " + row + ":" + rowData);
                System.err.println(" Expected predictions: " + Arrays.toString(expected_preds));
                System.err.println(" Actual predictions: " + Arrays.toString(actual_preds));
                System.err.println("Difference: " + Math.abs(expected_preds[expected_preds.length - 1] - actual_preds[actual_preds.length - 1]));
              }
              break;
            }
          }
        }
      }
      if (num_errors != 0)
        System.err.println("Number of errors: " + num_errors + (num_errors > 20 ? " (only first 20 are shown)" : "")
            + " out of " + num_total + " rows tested.");
      return num_errors == 0;
    } finally {
      Frame.deleteTempFrameAndItsNonSharedVecs(fr, data); // Remove temp keys.
    }
  }

  /** Deletes any cross-validation sub-models still present in the DKV. */
  public void deleteCrossValidationModels() {
    if (_output._cross_validation_models != null) {
      for (Key k : _output._cross_validation_models) {
        Model m = DKV.getGet(k);
        if (m != null) m.delete(); //delete all subparts
      }
    }
  }

  @Override
  public String toString() {
    return _output.toString();
  }

  /** Model stream writer - output Java code representation of model.
*/
  public class JavaModelStreamWriter extends StreamWriter {
    /** Show only preview */
    private final boolean preview;

    public JavaModelStreamWriter(boolean preview) {
      this.preview = preview;
    }

    @Override
    public void writeTo(OutputStream os) {
      toJava(os, preview, true);
    }
  }

  @Override
  public Class<KeyV3.ModelKeyV3> makeSchema() {
    return KeyV3.ModelKeyV3.class;
  }

  /**
   * Builds a Frame of interaction columns, one per InteractionPair, named "v1_v2".
   * NOTE(review): the {@code valid} flag is currently unused — confirm whether
   * validation frames need different handling before removing it (kept for
   * interface compatibility).
   */
  public static Frame makeInteractions(Frame fr, boolean valid, InteractionPair[] interactions,
                                       boolean useAllFactorLevels, boolean skipMissing, boolean standardize) {
    Vec anyTrainVec = fr.anyVec();
    Vec[] interactionVecs = new Vec[interactions.length];
    String[] interactionNames = new String[interactions.length];
    int idx = 0;
    for (InteractionPair ip : interactions) {
      interactionNames[idx] = fr.name(ip._v1) + "_" + fr.name(ip._v2);
      InteractionWrappedVec iwv =
          new InteractionWrappedVec(anyTrainVec.group().addVec(), anyTrainVec._rowLayout, ip._v1Enums, ip._v2Enums,
              useAllFactorLevels, skipMissing, standardize, fr.vec(ip._v1)._key, fr.vec(ip._v2)._key);
      interactionVecs[idx++] = iwv;
    }
    return new Frame(interactionNames, interactionVecs);
  }

  /** Same as the Frame variant, but returns the raw wrapped vecs without naming them. */
  public static InteractionWrappedVec[] makeInteractions(Frame fr, InteractionPair[] interactions,
                                                         boolean useAllFactorLevels, boolean skipMissing,
                                                         boolean standardize) {
    Vec anyTrainVec = fr.anyVec();
    InteractionWrappedVec[] interactionVecs = new InteractionWrappedVec[interactions.length];
    int idx = 0;
    for (InteractionPair ip : interactions)
      interactionVecs[idx++] =
          new InteractionWrappedVec(anyTrainVec.group().addVec(), anyTrainVec._rowLayout, ip._v1Enums, ip._v2Enums,
              useAllFactorLevels, skipMissing, standardize, fr.vec(ip._v1)._key, fr.vec(ip._v2)._key);
    return interactionVecs;
  }

  /** Builds a single interaction vec for one pair of columns. */
  public static InteractionWrappedVec makeInteraction(Frame fr, InteractionPair ip, boolean useAllFactorLevels,
                                                      boolean skipMissing, boolean standardize) {
    Vec anyVec = fr.anyVec();
    return new InteractionWrappedVec(anyVec.group().addVec(), anyVec._rowLayout, ip._v1Enums, ip._v2Enums,
        useAllFactorLevels, skipMissing, standardize, fr.vec(ip._v1)._key, fr.vec(ip._v2)._key);
  }

  /**
   * This class represents a pair of interacting columns plus some additional data
   * about specific enums to be interacted when the vecs are categorical. The question
   * naturally arises why not just use something like an ArrayList of int[2] (as is done,
   * for example, in the Interaction/CreateInteraction classes) and the answer essentially
   * boils down a desire to specify these specific levels.
   *
   * Another difference with the CreateInteractions class:
   *   1. do not interact on NA (someLvl_NA and NA_somLvl are actual NAs)
   *      this does not appear here, but in the InteractionWrappedVec class
   * TODO: refactor the CreateInteractions to be useful here and in InteractionWrappedVec
   */
  public static class InteractionPair extends Iced<InteractionPair> {
    public int vecIdx;
    private int _v1, _v2;
    private String[] _v1Enums;
    private String[] _v2Enums;
    private int _hash;

    // No-arg constructor for (de)serialization.
    private InteractionPair() {
    }

    private InteractionPair(int v1, int v2, String[] v1Enums, String[] v2Enums) {
      _v1 = v1;
      _v2 = v2;
      _v1Enums = v1Enums;
      _v2Enums = v2Enums;
      // hash is column ints; Item 9 p.47 of Effective Java
      _hash = 17;
      _hash = 31 * _hash + _v1;
      _hash = 31 * _hash + _v2;
      if (_v1Enums == null) _hash = 31 * _hash;
      else for (String s : _v1Enums) _hash = 31 * _hash + s.hashCode();
      if (_v2Enums == null) _hash = 31 * _hash;
      else for (String s : _v2Enums) _hash = 31 * _hash + s.hashCode();
    }

    /**
     * Generate all pairwise combinations of ints in the range [from,to).
     * @param from Start index
     * @param to End index (exclusive)
     * @return An array of interaction pairs.
     */
    public static InteractionPair[] generatePairwiseInteractions(int from, int to) {
      if (1 == (to - from))
        throw new IllegalArgumentException(
            "Illegal range of values, must be greater than a single value. Got: " + from + "<" + to);
      InteractionPair[] res = new InteractionPair[((to - from - 1) * (to - from)) >> 1]; // n*(n-1) / 2
      int idx = 0;
      for (int i = from; i < to; ++i)
        for (int j = i + 1; j < to; ++j)
          res[idx++] = new InteractionPair(i, j, null, null);
      return res;
    }

    /**
     * Generate all pairwise combinations of the arguments.
     * @param indexes An array of column indices.
     * @return An array of interaction pairs
     */
    public static InteractionPair[] generatePairwiseInteractionsFromList(int... indexes) {
      if (null == indexes) return null;
      if (indexes.length < 2) {
        if (indexes.length == 1 && indexes[0] == -1) return null; // sentinel meaning "no interactions"
        throw new IllegalArgumentException("Must supply 2 or more columns.");
      }
      InteractionPair[] res = new InteractionPair[(indexes.length - 1) * (indexes.length) >> 1]; // n*(n-1) / 2
      int idx = 0;
      for (int i = 0; i < indexes.length; ++i)
        for (int j = i + 1; j < indexes.length; ++j)
          res[idx++] = new InteractionPair(indexes[i], indexes[j], null, null);
      return res;
    }

    @Override
    public int hashCode() {
      return _hash;
    }

    @Override
    public String toString() {
      return _v1 + (_v1Enums == null ? "" : Arrays.toString(_v1Enums)) + ":"
          + _v2 + (_v2Enums == null ? "" : Arrays.toString(_v2Enums));
    }

    @Override
    public boolean equals(Object o) {
      boolean res = o instanceof InteractionPair;
      if (res) {
        InteractionPair ip = (InteractionPair) o;
        return (_v1 == ip._v1) && (_v2 == ip._v2)
            && Arrays.equals(_v1Enums, ip._v1Enums)
            && Arrays.equals(_v2Enums, ip._v2Enums);
      }
      return false;
    }
  }

  /**
   * Imports a binary model from a given location.
   * Note: binary model has to be created by the same version of H2O, import of a model from a different version will fail
   * @param location path to the binary representation of the model on a local filesystem, HDFS, S3...
   * @return instance of an H2O Model
   * @throws IOException when reading fails
   */
  public static <M extends Model<?, ?, ?>> M importBinaryModel(String location) throws IOException {
    InputStream is = null;
    try {
      URI targetUri = FileUtils.getURI(location);
      Persist p = H2O.getPM().getPersistForURI(targetUri);
      is = p.open(targetUri.toString());
      final AutoBuffer ab = new AutoBuffer(is);
      ab.sourceName = targetUri.toString();
      @SuppressWarnings("unchecked")
      M model = (M) Keyed.readAll(ab);
      ab.close();
      is.close();
      return model;
    } finally {
      FileUtils.close(is); // null-safe; also covers the early-exception path
    }
  }

  /**
   * Exports a binary model to a given location.
   * @param location target path, it can be on local filesystem, HDFS, S3...
   * @param force If true, overwrite already existing file
   * @return URI representation of the target location
   * @throws IOException when writing fails
   */
  public URI exportBinaryModel(String location, boolean force) throws IOException {
    OutputStream os = null;
    try {
      URI targetUri = FileUtils.getURI(location);
      Persist p = H2O.getPM().getPersistForURI(targetUri);
      os = p.create(targetUri.toString(), force);
      this.writeAll(new AutoBuffer(os, true)).close();
      os.close();
      return targetUri;
    } finally {
      FileUtils.close(os);
    }
  }

  /**
   * Exports a MOJO representation of a model to a given location.
   * @param location target path, it can be on local filesystem, HDFS, S3...
   * @param force If true, overwrite already existing file
   * @return URI representation of the target location
   * @throws IOException when writing fails
   * @throws H2OAbstractRuntimeException if this model type does not support MOJOs (see {@link #getMojo()})
   */
  public URI exportMojo(String location, boolean force) throws IOException {
    OutputStream os = null;
    try {
      URI targetUri = FileUtils.getURI(location);
      Persist p = H2O.getPM().getPersistForURI(targetUri);
      os = p.create(targetUri.toString(), force);
      // FIX: was a copy-paste of exportBinaryModel (writeAll), which wrote the
      // internal binary format, not a MOJO. Serialize the real MOJO artifact,
      // the same way testJavaScoring streams it via the StreamWriter API.
      getMojo().writeTo(os);
      os.close();
      return targetUri;
    } finally {
      FileUtils.close(os);
    }
  }
}