hivemall.classifier.multiclass.MulticlassOnlineClassifierUDTF.java Source code

Java tutorial

Introduction

Here is the source code for hivemall.classifier.multiclass.MulticlassOnlineClassifierUDTF.java

Source

/*
 * Hivemall: Hive scalable Machine Learning Library
 *
 * Copyright (C) 2015 Makoto YUI
 * Copyright (C) 2013-2015 National Institute of Advanced Industrial Science and Technology (AIST)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package hivemall.classifier.multiclass;

import static hivemall.HivemallConstants.BIGINT_TYPE_NAME;
import static hivemall.HivemallConstants.INT_TYPE_NAME;
import static hivemall.HivemallConstants.STRING_TYPE_NAME;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.writableFloatObjectInspector;
import hivemall.LearnerBaseUDTF;
import hivemall.model.FeatureValue;
import hivemall.model.IWeightValue;
import hivemall.model.Margin;
import hivemall.model.PredictionModel;
import hivemall.model.PredictionResult;
import hivemall.model.WeightValue;
import hivemall.model.WeightValue.WeightValueWithCovar;
import hivemall.utils.collections.IMapIterator;
import hivemall.utils.datetime.StopWatch;
import hivemall.utils.hadoop.HadoopUtils;
import hivemall.utils.hadoop.HiveUtils;
import hivemall.utils.io.IOUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableFloatObjectInspector;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;

public abstract class MulticlassOnlineClassifierUDTF extends LearnerBaseUDTF {
    /** Logger used for the training and model-loading summaries written by this class. */
    private static final Log logger = LogFactory.getLog(MulticlassOnlineClassifierUDTF.class);

    /** Inspector of the 1st argument (the feature list); assigned in processFeaturesOI(). */
    private ListObjectInspector featureListOI;
    /** True when the feature elements are strings, in which case they go through FeatureValue.parse(). */
    private boolean parseFeature;
    /** Inspector of the 2nd argument (the label); assigned in initialize(). */
    private PrimitiveObjectInspector labelInputOI;

    /** One prediction model per class label; created in initialize() and set to null by close(). */
    protected Map<Object, PredictionModel> label2model;
    /** Reset to 0 in initialize() and incremented by process() for every row handed to train(). */
    protected int count;

    /**
     * Validates the argument inspectors, parses the options and prepares the per-label model map.
     *
     * @param argOIs inspectors of the call arguments: the feature list, the label and, optionally,
     *        further arguments that are handed to {@code processOptions}
     * @return the inspector describing the rows built by {@link #getReturnOI}
     * @throws UDFArgumentException if fewer than two arguments are given or the label/feature
     *         type is not one of Int, BigInt or Text
     */
    @Override
    public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
        if (argOIs.length < 2) {
            throw new UDFArgumentException(getClass().getSimpleName()
                    + " takes 2 arguments: List<Int|BigInt|Text> features, {Int|BigInt|Text} label [, constant text options]");
        }
        PrimitiveObjectInspector featureInputOI = processFeaturesOI(argOIs[0]);
        this.labelInputOI = HiveUtils.asPrimitiveObjectInspector(argOIs[1]);
        String labelTypeName = labelInputOI.getTypeName();
        if (!STRING_TYPE_NAME.equals(labelTypeName) && !INT_TYPE_NAME.equals(labelTypeName)
                && !BIGINT_TYPE_NAME.equals(labelTypeName)) {
            // The label is the 2nd argument, hence argument index 1 (indexes are zero-based).
            throw new UDFArgumentTypeException(1, "label must be a type [Int|BigInt|Text]: " + labelTypeName);
        }

        processOptions(argOIs);

        // A dense model reports features as Java ints; otherwise the input feature type is kept.
        PrimitiveObjectInspector featureOutputOI = dense_model
                ? PrimitiveObjectInspectorFactory.javaIntObjectInspector
                : featureInputOI;
        this.label2model = new HashMap<Object, PredictionModel>(64);
        if (preloadedModelFile != null) {
            loadPredictionModel(label2model, preloadedModelFile, labelInputOI, featureOutputOI);
        }

        this.count = 0;
        return getReturnOI(labelInputOI, featureOutputOI);
    }

    /**
     * Returns the initial model size used by this learner.
     *
     * @return always 8192
     */
    @Override
    protected int getInitialModelSize() {
        final int initialSize = 8192;
        return initialSize;
    }

    /**
     * Validates the inspector of the feature argument and remembers how its elements must be parsed.
     *
     * <p>Stores the list inspector in {@code featureListOI} and sets {@code parseFeature} to
     * {@code true} when the list elements are strings.
     *
     * @param arg inspector of the 1st argument
     * @return the primitive inspector of the list elements
     * @throws UDFArgumentException if {@code arg} is not a list inspector or its element type is
     *         not one of Int, BigInt or Text
     */
    protected PrimitiveObjectInspector processFeaturesOI(ObjectInspector arg) throws UDFArgumentException {
        // Reject non-list arguments with a proper argument error instead of a ClassCastException.
        if (!(arg instanceof ListObjectInspector)) {
            throw new UDFArgumentTypeException(0,
                    "1st argument must be List of element type [Int|BigInt|Text]: "
                            + (arg == null ? null : arg.getTypeName()));
        }
        this.featureListOI = (ListObjectInspector) arg;
        ObjectInspector featureRawOI = featureListOI.getListElementObjectInspector();
        String keyTypeName = featureRawOI.getTypeName();
        if (!STRING_TYPE_NAME.equals(keyTypeName) && !INT_TYPE_NAME.equals(keyTypeName)
                && !BIGINT_TYPE_NAME.equals(keyTypeName)) {
            throw new UDFArgumentTypeException(0,
                    "1st argument must be List of element type [Int|BigInt|Text]: " + keyTypeName);
        }
        // String features are parsed by FeatureValue.parse(); numeric ones get a weight of 1.
        this.parseFeature = STRING_TYPE_NAME.equals(keyTypeName);
        return HiveUtils.asPrimitiveObjectInspector(featureRawOI);
    }

    /**
     * Builds the struct inspector of the forwarded rows: {@code label}, {@code feature},
     * {@code weight} and, when {@code useCovariance()} is true, {@code covar}.
     *
     * @param labelRawOI inspector of the label, converted to its standard form
     * @param featureRawOI inspector of the feature, converted to its standard form
     * @return the struct inspector with three or four columns
     */
    protected StructObjectInspector getReturnOI(ObjectInspector labelRawOI, ObjectInspector featureRawOI) {
        final ObjectInspector stdLabelOI = ObjectInspectorUtils.getStandardObjectInspector(labelRawOI);
        final ObjectInspector stdFeatureOI = ObjectInspectorUtils.getStandardObjectInspector(featureRawOI);
        final boolean withCovar = useCovariance();

        final int numColumns = withCovar ? 4 : 3;
        final List<String> columnNames = new ArrayList<String>(numColumns);
        final List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(numColumns);

        columnNames.add("label");
        columnOIs.add(stdLabelOI);

        columnNames.add("feature");
        columnOIs.add(stdFeatureOI);

        columnNames.add("weight");
        columnOIs.add(writableFloatObjectInspector);

        if (withCovar) {
            columnNames.add("covar");
            columnOIs.add(writableFloatObjectInspector);
        }

        return ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnOIs);
    }

    /**
     * Consumes one training row: parses the features, validates the label and calls
     * {@link #train}.
     *
     * <p>Rows whose feature list is NULL or empty are skipped without being counted.
     *
     * @param args the row arguments; {@code args[0]} is the feature list and {@code args[1]}
     *        the label
     * @throws HiveException if the label is NULL
     */
    @Override
    public void process(Object[] args) throws HiveException {
        List<?> features = (List<?>) featureListOI.getList(args[0]);
        if (features == null) {
            // A NULL feature column yields a null list; skip it like an empty list
            // instead of failing with a NullPointerException in parseFeatures().
            return;
        }
        FeatureValue[] featureVector = parseFeatures(features);
        if (featureVector == null) {
            return;
        }
        Object label = ObjectInspectorUtils.copyToStandardObject(args[1], labelInputOI);
        if (label == null) {
            throw new UDFArgumentException("label value must not be NULL");
        }

        count++;
        train(featureVector, label);
    }

    /**
     * Converts the raw feature list into a {@code FeatureValue} array of the same length.
     *
     * <p>String features go through {@code FeatureValue.parse}; other features are copied to
     * their standard object and paired with a value of {@code 1.f}. A {@code null} element
     * leaves the matching array slot {@code null}.
     *
     * @param features the raw feature list
     * @return the parsed features, or {@code null} when the list is empty
     */
    @Nullable
    protected final FeatureValue[] parseFeatures(@Nonnull final List<?> features) {
        final int numFeatures = features.size();
        if (numFeatures == 0) {
            return null;
        }

        final ObjectInspector elementOI = featureListOI.getListElementObjectInspector();
        final FeatureValue[] parsed = new FeatureValue[numFeatures];
        for (int pos = 0; pos < numFeatures; pos++) {
            final Object raw = features.get(pos);
            if (raw != null) {
                parsed[pos] = parseFeature
                        ? FeatureValue.parse(raw)
                        : new FeatureValue(ObjectInspectorUtils.copyToStandardObject(raw, elementOI), 1.f);
            }
        }
        return parsed;
    }

    /**
     * Learns from a single training example; implemented by the concrete classifier.
     *
     * <p>Called by {@code process()} once per row, after the row counter has been incremented.
     *
     * @param features the parsed feature vector; individual elements may be {@code null}
     *        because {@code parseFeatures()} leaves a null slot for each null input element
     * @param actual_label the label of the example; {@code process()} rejects NULL labels
     *        before calling this method
     */
    protected abstract void train(@Nonnull final FeatureValue[] features, @Nonnull final Object actual_label);

    /**
     * Scores the features against every per-label model and returns the best-scoring label.
     *
     * @param features the feature vector to classify
     * @return the label with the highest score and that score; when no model exists yet the
     *         label is {@code null} and the score is {@code Float.MIN_VALUE}
     */
    protected final PredictionResult classify(@Nonnull final FeatureValue[] features) {
        Object bestLabel = null;
        // NOTE: Float.MIN_VALUE is the smallest positive float, not the most negative one.
        // It is only observable when label2model is empty, because the first scored label
        // always replaces it thanks to the null check below.
        float bestScore = Float.MIN_VALUE;

        for (Map.Entry<Object, PredictionModel> candidate : label2model.entrySet()) {
            final Object candidateLabel = candidate.getKey();
            final float candidateScore = calcScore(candidate.getValue(), features);
            final boolean nothingSelectedYet = (bestLabel == null);
            if (nothingSelectedYet || candidateScore > bestScore) {
                bestLabel = candidateLabel;
                bestScore = candidateScore;
            }
        }

        return new PredictionResult(bestLabel, bestScore);
    }

    /**
     * Computes the score of the actual label and the best score among all other labels.
     *
     * @param features the feature vector of the example
     * @param actual_label the true label of the example
     * @return a margin holding the actual label's score (0 when it has no model yet) and the
     *         best other label with its score ({@code null} and 0 when there is none)
     */
    protected Margin getMargin(@Nonnull final FeatureValue[] features, final Object actual_label) {
        float scoreOfActual = 0.f;
        Object bestOtherLabel = null;
        float bestOtherScore = 0.f;

        for (Map.Entry<Object, PredictionModel> candidate : label2model.entrySet()) {
            final Object candidateLabel = candidate.getKey();
            final float candidateScore = calcScore(candidate.getValue(), features);
            if (candidateLabel.equals(actual_label)) {
                scoreOfActual = candidateScore;
                continue;
            }
            // Track the strongest competitor among the other labels.
            if (bestOtherLabel == null || candidateScore > bestOtherScore) {
                bestOtherLabel = candidateLabel;
                bestOtherScore = candidateScore;
            }
        }
        return new Margin(scoreOfActual, bestOtherLabel, bestOtherScore);
    }

    /**
     * Computes margin and variance with the {@code nonZeroVariance} flag switched off.
     *
     * @param features the feature vector of the example
     * @param actual_label the true label of the example
     * @return the result of the three-argument overload called with {@code false}
     */
    protected Margin getMarginAndVariance(@Nonnull final FeatureValue[] features, final Object actual_label) {
        final boolean nonZeroVariance = false;
        return getMarginAndVariance(features, actual_label, nonZeroVariance);
    }

    /**
     * Computes the margin like {@link #getMargin} and attaches a variance to it.
     *
     * <p>The variance is the sum of the variance of the actual label and that of the best other
     * label. When {@code nonZeroVariance} is set and no model exists yet, the variance is twice
     * {@code calcVariance(features)} instead.
     *
     * @param features the feature vector of the example
     * @param actual_label the true label of the example
     * @param nonZeroVariance whether to return a data-derived variance for an empty model map
     * @return the margin with its variance set
     */
    protected Margin getMarginAndVariance(@Nonnull final FeatureValue[] features, final Object actual_label,
            boolean nonZeroVariance) {
        float scoreOfActual = 0.f;
        float varianceOfActual = 0.f;
        Object bestOtherLabel = null;
        float bestOtherScore = 0.f;
        float bestOtherVariance = 0.f;

        // Very first call: there is no model to score against yet.
        final boolean initialCall = nonZeroVariance && label2model.isEmpty();
        if (initialCall) {
            final float initialVariance = 2.f * calcVariance(features);
            return new Margin(scoreOfActual, bestOtherLabel, bestOtherScore).variance(initialVariance);
        }

        for (Map.Entry<Object, PredictionModel> candidate : label2model.entrySet()) {
            final Object candidateLabel = candidate.getKey();
            final PredictionResult result = calcScoreAndVariance(candidate.getValue(), features);
            final float candidateScore = result.getScore();

            if (candidateLabel.equals(actual_label)) {
                scoreOfActual = candidateScore;
                varianceOfActual = result.getVariance();
                continue;
            }
            if (bestOtherLabel == null || candidateScore > bestOtherScore) {
                bestOtherLabel = candidateLabel;
                bestOtherScore = candidateScore;
                bestOtherVariance = result.getVariance();
            }
        }

        final float totalVariance = varianceOfActual + bestOtherVariance;
        return new Margin(scoreOfActual, bestOtherLabel, bestOtherScore).variance(totalVariance);
    }

    /**
     * Sums the squared values of all non-null features.
     *
     * @param features the feature vector; {@code null} elements are ignored
     * @return the sum of {@code x[i] * x[i]}
     */
    protected final float squaredNorm(@Nonnull final FeatureValue[] features) {
        float sumOfSquares = 0.f;
        for (int i = 0; i < features.length; i++) {// sum += x[i] * x[i]
            final FeatureValue feature = features[i];
            if (feature != null) {
                final float x = feature.getValueAsFloat();
                sumOfSquares += (x * x);
            }
        }
        return sumOfSquares;
    }

    /**
     * Computes the dot product of the model weights and the feature values.
     *
     * @param model the model supplying the weight of each feature
     * @param features the feature vector; {@code null} elements are ignored
     * @return the sum of {@code w[i] * x[i]} over the features whose weight is not zero
     */
    protected final float calcScore(@Nonnull final PredictionModel model, @Nonnull final FeatureValue[] features) {
        float dotProduct = 0.f;
        for (int i = 0; i < features.length; i++) {// dotProduct += w[i] * x[i]
            final FeatureValue feature = features[i];
            if (feature == null) {
                continue;
            }
            final Object key = feature.getFeature();
            final float x = feature.getValueAsFloat();

            final float weight = model.getWeight(key);
            // Zero weights are skipped explicitly rather than multiplied in.
            if (weight != 0f) {
                dotProduct += (weight * x);
            }
        }
        return dotProduct;
    }

    /**
     * Sums the squared values of all non-null features, without consulting any model.
     *
     * @param features the feature vector; {@code null} elements are ignored
     * @return the sum of {@code x[i] * x[i]}
     */
    protected final float calcVariance(@Nonnull final FeatureValue[] features) {
        float total = 0.f;
        for (int i = 0; i < features.length; i++) {// total += x[i] * x[i]
            final FeatureValue feature = features[i];
            if (feature != null) {
                final float x = feature.getValueAsFloat();
                total += x * x;
            }
        }
        return total;
    }

    /**
     * Computes the score and the variance of the features under one model.
     *
     * <p>A feature unknown to the model contributes nothing to the score and {@code x * x} to
     * the variance, i.e. it is treated as having a covariance of 1.
     *
     * @param model the model supplying weight and covariance of each feature
     * @param features the feature vector; {@code null} elements are ignored
     * @return a prediction result carrying the score with the variance attached
     */
    protected final PredictionResult calcScoreAndVariance(@Nonnull final PredictionModel model,
            @Nonnull final FeatureValue[] features) {
        float scoreSum = 0.f;
        float varianceSum = 0.f;

        for (int i = 0; i < features.length; i++) {
            final FeatureValue feature = features[i];
            if (feature == null) {
                continue;
            }
            final Object key = feature.getFeature();
            final float x = feature.getValueAsFloat();

            final IWeightValue stored = model.get(key);
            if (stored != null) {
                scoreSum += (stored.get() * x);// score += w[i] * x[i]
                varianceSum += (stored.getCovariance() * x * x);// variance += cov[i] * x[i]^2
            } else {
                varianceSum += (x * x);// an unseen feature counts with covariance 1
            }
        }

        return new PredictionResult(scoreSum).variance(varianceSum);
    }

    /**
     * Moves the weights of the actual label towards the example and, if a missed label is given,
     * moves the weights of that label away from it by the same amount.
     *
     * <p>Models are created on demand for labels that do not have one yet.
     *
     * @param features the feature vector of the example; {@code null} elements are ignored
     * @param coeff the step size multiplied with each feature value
     * @param actual_label the true label; must not be {@code null}
     * @param missed_label the wrongly predicted label, or {@code null} to update only the
     *        actual label's model
     * @throws IllegalArgumentException if both labels are equal
     */
    protected void update(@Nonnull final FeatureValue[] features, float coeff, Object actual_label,
            Object missed_label) {
        assert (actual_label != null);
        if (actual_label.equals(missed_label)) {
            throw new IllegalArgumentException("Actual label equals to missed label: " + actual_label);
        }

        PredictionModel trueClassModel = label2model.get(actual_label);
        if (trueClassModel == null) {
            trueClassModel = createModel();
            label2model.put(actual_label, trueClassModel);
        }

        PredictionModel falseClassModel = null;
        if (missed_label != null) {
            falseClassModel = label2model.get(missed_label);
            if (falseClassModel == null) {
                falseClassModel = createModel();
                label2model.put(missed_label, falseClassModel);
            }
        }
        final boolean penalizeMissed = (falseClassModel != null);

        for (int i = 0; i < features.length; i++) {// w[f] += y * x[f]
            final FeatureValue feature = features[i];
            if (feature == null) {
                continue;
            }
            final Object key = feature.getFeature();
            final float delta = coeff * feature.getValueAsFloat();

            final float raisedWeight = trueClassModel.getWeight(key) + delta;
            trueClassModel.set(key, new WeightValue(raisedWeight));

            if (penalizeMissed) {
                final float loweredWeight = falseClassModel.getWeight(key) - delta;
                falseClassModel.set(key, new WeightValue(loweredWeight));
            }
        }
    }

    /**
     * Forwards the trained models and releases them.
     *
     * <p>For every label, each touched weight is forwarded as a row of
     * {@code (label, feature, weight)} or, when {@code useCovariance()} is true,
     * {@code (label, feature, weight, covar)}. Untouched weights are not forwarded. Afterwards
     * {@code label2model} is set to {@code null} and two summary lines are logged.
     *
     * @throws HiveException if the superclass close or forwarding a row fails
     */
    @Override
    public final void close() throws HiveException {
        super.close();
        if (label2model == null) {
            return;
        }

        long forwardedRows = 0L;
        long mixedTotal = 0L;
        if (useCovariance()) {
            final WeightValueWithCovar probe = new WeightValueWithCovar();
            final Object[] row = new Object[4];
            final FloatWritable weightOut = new FloatWritable();
            final FloatWritable covarOut = new FloatWritable();
            for (Map.Entry<Object, PredictionModel> labelAndModel : label2model.entrySet()) {
                row[0] = labelAndModel.getKey();
                final PredictionModel model = labelAndModel.getValue();
                mixedTotal += model.getNumMixed();
                final IMapIterator<Object, IWeightValue> cursor = model.entries();
                while (cursor.next() != -1) {
                    cursor.getValue(probe);
                    if (probe.isTouched()) {// untouched weights are not emitted
                        final Object featureKey = cursor.getKey();
                        weightOut.set(probe.get());
                        covarOut.set(probe.getCovariance());
                        row[1] = featureKey;
                        row[2] = weightOut;
                        row[3] = covarOut;
                        forward(row);
                        forwardedRows++;
                    }
                }
            }
        } else {
            final WeightValue probe = new WeightValue();
            final Object[] row = new Object[3];
            final FloatWritable weightOut = new FloatWritable();
            for (Map.Entry<Object, PredictionModel> labelAndModel : label2model.entrySet()) {
                row[0] = labelAndModel.getKey();
                final PredictionModel model = labelAndModel.getValue();
                mixedTotal += model.getNumMixed();
                final IMapIterator<Object, IWeightValue> cursor = model.entries();
                while (cursor.next() != -1) {
                    cursor.getValue(probe);
                    if (probe.isTouched()) {// untouched weights are not emitted
                        final Object featureKey = cursor.getKey();
                        weightOut.set(probe.get());
                        row[1] = featureKey;
                        row[2] = weightOut;
                        forward(row);
                        forwardedRows++;
                    }
                }
            }
        }

        this.label2model = null;

        String mixedInfo = "";
        if (mixedTotal > 0) {
            mixedInfo = "( numMixed: " + mixedTotal + " )";
        }
        logger.info("Trained a prediction model using " + count + " training examples" + mixedInfo);
        logger.info("Forwarded the prediction model of " + forwardedRows + " rows");
    }

    /**
     * Loads previously stored weights from {@code filename} into the per-label models and logs
     * how many features were loaded for each label.
     *
     * <p>The file is read with a covariance column when {@code useCovariance()} is true and
     * without one otherwise. Nothing is logged when no model was loaded.
     *
     * @param label2model the map that receives one model per label found in the file
     * @param filename path of the model file or directory
     * @param labelOI inspector used to read the label column
     * @param featureOI inspector used to read the feature column
     * @throws RuntimeException wrapping the {@code IOException} or {@code SerDeException}
     *         raised while reading
     */
    protected void loadPredictionModel(Map<Object, PredictionModel> label2model, String filename,
            PrimitiveObjectInspector labelOI, PrimitiveObjectInspector featureOI) {
        final StopWatch elapsed = new StopWatch();
        final long lines;
        try {
            final boolean withCovar = useCovariance();
            final File modelFile = new File(filename);
            lines = withCovar
                    ? loadPredictionModel(label2model, modelFile, labelOI, featureOI,
                            writableFloatObjectInspector, writableFloatObjectInspector)
                    : loadPredictionModel(label2model, modelFile, labelOI, featureOI,
                            writableFloatObjectInspector);
        } catch (IOException ioe) {
            throw new RuntimeException("Failed to load a model: " + filename, ioe);
        } catch (SerDeException sde) {
            throw new RuntimeException("Failed to load a model: " + filename, sde);
        }

        if (label2model.isEmpty()) {
            return;
        }

        // Collect one statistics line per label together with the overall feature count.
        long totalFeatures = 0L;
        final StringBuilder perLabelStats = new StringBuilder(256);
        for (Map.Entry<Object, PredictionModel> labelAndModel : label2model.entrySet()) {
            final Object label = labelAndModel.getKey();
            final int numFeatures = labelAndModel.getValue().size();
            totalFeatures += numFeatures;
            perLabelStats.append('\n');
            perLabelStats.append("Label: ").append(label);
            perLabelStats.append(", Number of Features: ").append(numFeatures);
        }
        logger.info("Loaded total " + totalFeatures + " features from distributed cache '" + filename + "' ("
                + lines + " lines) in " + elapsed + perLabelStats);
    }

    /**
     * Reads {@code (label, feature, weight)} lines from a file, or recursively from every entry
     * of a directory, into the per-label models.
     *
     * <p>Missing files and files whose name ends with {@code .crc} are ignored. Lines with a
     * null label, feature or weight are counted but not loaded.
     *
     * @param label2model the map that receives one model per label; models are created on demand
     * @param file the file or directory to read
     * @param labelOI inspector of the label column
     * @param featureOI inspector of the feature column
     * @param weightOI inspector of the weight column
     * @return the number of lines read, including skipped ones
     * @throws IOException if reading fails or the directory content cannot be listed
     * @throws SerDeException if a line cannot be deserialized
     */
    private long loadPredictionModel(Map<Object, PredictionModel> label2model, File file,
            PrimitiveObjectInspector labelOI, PrimitiveObjectInspector featureOI,
            WritableFloatObjectInspector weightOI) throws IOException, SerDeException {
        if (!file.exists() || file.getName().endsWith(".crc")) {
            return 0L;
        }

        if (file.isDirectory()) {
            // File.listFiles() returns null on an I/O error; report it instead of an NPE.
            final File[] children = file.listFiles();
            if (children == null) {
                throw new IOException("Failed to list files in a directory: " + file.getAbsolutePath());
            }
            long total = 0L;
            for (File child : children) {
                total += loadPredictionModel(label2model, child, labelOI, featureOI, weightOI);
            }
            return total;
        }

        LazySimpleSerDe serde = HiveUtils.getLineSerde(labelOI, featureOI, weightOI);
        StructObjectInspector lineOI = (StructObjectInspector) serde.getObjectInspector();
        StructField c1ref = lineOI.getStructFieldRef("c1");
        StructField c2ref = lineOI.getStructFieldRef("c2");
        StructField c3ref = lineOI.getStructFieldRef("c3");
        PrimitiveObjectInspector c1refOI = (PrimitiveObjectInspector) c1ref.getFieldObjectInspector();
        PrimitiveObjectInspector c2refOI = (PrimitiveObjectInspector) c2ref.getFieldObjectInspector();
        FloatObjectInspector c3refOI = (FloatObjectInspector) c3ref.getFieldObjectInspector();

        long count = 0L;
        BufferedReader reader = null;
        try {
            reader = HadoopUtils.getBufferedReader(file);
            String line;
            while ((line = reader.readLine()) != null) {
                count++;
                Text lineText = new Text(line);
                Object lineObj = serde.deserialize(lineText);
                List<Object> fields = lineOI.getStructFieldsDataAsList(lineObj);
                Object f0 = fields.get(0);
                Object f1 = fields.get(1);
                Object f2 = fields.get(2);
                if (f0 == null || f1 == null || f2 == null) {
                    continue; // avoid the case that key or value is null
                }
                Object label = c1refOI.getPrimitiveWritableObject(c1refOI.copyObject(f0));
                PredictionModel model = label2model.get(label);
                if (model == null) {
                    model = createModel();
                    label2model.put(label, model);
                }
                Object k = c2refOI.getPrimitiveWritableObject(c2refOI.copyObject(f1));
                float v = c3refOI.get(f2);
                // NOTE(review): the `false` flag presumably marks the weight as untouched so
                // that close() does not re-emit it unless training changes it - confirm
                // against WeightValue.
                model.set(k, new WeightValue(v, false));
            }
        } finally {
            IOUtils.closeQuietly(reader);
        }
        return count;
    }

    /**
     * Reads {@code (label, feature, weight, covar)} lines from a file, or recursively from every
     * entry of a directory, into the per-label models.
     *
     * <p>Missing files and files whose name ends with {@code .crc} are ignored. Lines with a
     * null label, feature or weight are counted but not loaded; a null covariance falls back to
     * {@code WeightValueWithCovar.DEFAULT_COVAR}.
     *
     * @param label2model the map that receives one model per label; models are created on demand
     * @param file the file or directory to read
     * @param labelOI inspector of the label column
     * @param featureOI inspector of the feature column
     * @param weightOI inspector of the weight column
     * @param covarOI inspector of the covariance column
     * @return the number of lines read, including skipped ones
     * @throws IOException if reading fails or the directory content cannot be listed
     * @throws SerDeException if a line cannot be deserialized
     */
    private long loadPredictionModel(Map<Object, PredictionModel> label2model, File file,
            PrimitiveObjectInspector labelOI, PrimitiveObjectInspector featureOI,
            WritableFloatObjectInspector weightOI, WritableFloatObjectInspector covarOI)
            throws IOException, SerDeException {
        if (!file.exists() || file.getName().endsWith(".crc")) {
            return 0L;
        }

        if (file.isDirectory()) {
            // File.listFiles() returns null on an I/O error; report it instead of an NPE.
            final File[] children = file.listFiles();
            if (children == null) {
                throw new IOException("Failed to list files in a directory: " + file.getAbsolutePath());
            }
            long total = 0L;
            for (File child : children) {
                total += loadPredictionModel(label2model, child, labelOI, featureOI, weightOI, covarOI);
            }
            return total;
        }

        LazySimpleSerDe serde = HiveUtils.getLineSerde(labelOI, featureOI, weightOI, covarOI);
        StructObjectInspector lineOI = (StructObjectInspector) serde.getObjectInspector();
        StructField c1ref = lineOI.getStructFieldRef("c1");
        StructField c2ref = lineOI.getStructFieldRef("c2");
        StructField c3ref = lineOI.getStructFieldRef("c3");
        StructField c4ref = lineOI.getStructFieldRef("c4");
        PrimitiveObjectInspector c1refOI = (PrimitiveObjectInspector) c1ref.getFieldObjectInspector();
        PrimitiveObjectInspector c2refOI = (PrimitiveObjectInspector) c2ref.getFieldObjectInspector();
        FloatObjectInspector c3refOI = (FloatObjectInspector) c3ref.getFieldObjectInspector();
        FloatObjectInspector c4refOI = (FloatObjectInspector) c4ref.getFieldObjectInspector();

        long count = 0L;
        BufferedReader reader = null;
        try {
            reader = HadoopUtils.getBufferedReader(file);
            String line;
            while ((line = reader.readLine()) != null) {
                count++;
                Text lineText = new Text(line);
                Object lineObj = serde.deserialize(lineText);
                List<Object> fields = lineOI.getStructFieldsDataAsList(lineObj);
                Object f0 = fields.get(0);
                Object f1 = fields.get(1);
                Object f2 = fields.get(2);
                Object f3 = fields.get(3);
                if (f0 == null || f1 == null || f2 == null) {
                    continue; // avoid unexpected case
                }
                Object label = c1refOI.getPrimitiveWritableObject(c1refOI.copyObject(f0));
                PredictionModel model = label2model.get(label);
                if (model == null) {
                    model = createModel();
                    label2model.put(label, model);
                }
                Object k = c2refOI.getPrimitiveWritableObject(c2refOI.copyObject(f1));
                float v = c3refOI.get(f2);
                // A missing covariance column falls back to the default covariance.
                float cov = (f3 == null) ? WeightValueWithCovar.DEFAULT_COVAR : c4refOI.get(f3);
                // NOTE(review): the `false` flag presumably marks the weight as untouched so
                // that close() does not re-emit it unless training changes it - confirm
                // against WeightValueWithCovar.
                model.set(k, new WeightValueWithCovar(v, cov, false));
            }
        } finally {
            IOUtils.closeQuietly(reader);
        }
        return count;
    }
}