Java tutorial
/** * Copyright 2014 LinkedIn Corp. All rights reserved. * Licensed under the Apache License, Version 2.0 (the "License");you may * not use this file except in compliance with the License.You may obtain a * copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS,WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. */ package com.linkedin.mlease.regression.liblinearfunc; import java.io.File; import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.apache.commons.math3.linear.CholeskyDecomposition; import org.apache.commons.math3.linear.DecompositionSolver; import org.apache.commons.math3.linear.MatrixUtils; import org.apache.commons.math3.linear.RealMatrix; import org.apache.hadoop.mapred.Reporter; import com.linkedin.mlease.models.LinearModel; import com.linkedin.mlease.regression.liblinearfunc.LibLinearFunction; import com.linkedin.mlease.regression.liblinearfunc.LogisticRegressionL2; import com.linkedin.mlease.regression.liblinearfunc.LogisticRegressionL2BinaryFeature; import com.linkedin.mlease.utils.Util; import de.bwaldvogel.liblinear.Tron; /** * <p> * Fit a model using LibLinear * </p> * <p> * Example: * </p> * * <pre> * LibLinearDataset dataset = new LibLinearDataset(1.0); // add an intercept/bias * // Or, = new LibLinearBinaryDataset(1.0, false); // to save memory * * // add records into the dataset * for(...){ * ... * dataset.addInstanceJSON(record); // record is a Map of JSON object * } * dataset.finish(); // finish() must be called before training * LibLinear liblinear = new LibLinear(); * liblinear.train(dataset, 1.0, "max_iter=50, epsilon=0.01, positive_weight=5"); * LinearModel model = liblinear.getLinearModel(); * </pre> * <p> * Options: * </p> * <ul> * <li>max_iter: maximum number of iterations</li> * <li>epsilon: precision of the solution</li> * <li>positive_weight: weight to be added to the positive instances</li> * </ul> * * IMPORTANT NOTE: voldemort.serialization.json.JsonReader seems to have a bug * when reading double values. Use string instead. For example, * {... "offset"="-1.23", ...}, instead of {... "offset"=-1.23, ...} * * @author bchen */ public class LibLinear { // fitted parameter vector (in the map format) Map<String, Double> coeff = null; // fitted parameter vector (in the array format) double[] param = null; // posterior variance (only diagonal elements) Map<String, Double> postVarMap = null; // in the map format double[] postVar = null; // in the array format double[] priorMean = null; double[] priorVar = null; public double bias = 0; double epsilon = 0.01; String type = Logistic_L2_primal; int max_iter = 10000; int verbose = 0; double positive_weight = 1; public static String Logistic_L2_primal = "Logistic_L2_primal"; public static String Do_nothing = "Do_nothing"; Reporter reporter = null; long reportFrequency = 10000000; boolean computeFullPostVar = false; double[][] postVarMatrix = null; Map<List<String>, Double> postVarMatrixMap = null; // key: [FeatureName1, FeatureName2] // value: covariance // including FeatureName1==FeatureName2 void parseOption(String option) throws Exception { if (option == null) return; if ("".equals(option)) return; String[] token = option.split("\\s*,\\s*"); for (int i = 0; i < token.length; i++) { String[] pair = token[i].split("\\s*=\\s*"); if (pair.length != 2) throw new Exception("Unknown option specification: '" + token[i] + "' in '" + option + "'"); try { if (pair[0].equals("epsilon")) { epsilon = Util.atof(pair[1]); } else if (pair[0].equals("type")) { type = pair[1]; } else if (pair[0].equals("max_iter")) { max_iter = Util.atoi(pair[1]); } else if (pair[0].equals("verbose")) { verbose = Util.atoi(pair[1]); } else if (pair[0].equals("positive_weight")) { positive_weight = Util.atof(pair[1]); } else throw new Exception(); } catch (Exception e) { throw new Exception("Invalid option specification: '" + token[i] + "' in '" + option + "'"); } } } /** * Whether to compute the full posterior variance-covariance matrix * @param compute */ public void setComputeFullPostVar(boolean compute) { computeFullPostVar = compute; } /** * Set the reporter to report progress after processing n instances * * @param rep * @param n */ public void setReporter(Reporter rep, long n) { reporter = rep; if (n < 1000) n = 1000; reportFrequency = n; } /** * Train (i.e., fit or build) a model using input dataset according to the options. * * @param dataset * Training dataset * @param initParam * Initial parameter vector (can be null, meaning all 0) * @param priorMean * The prior mean of the parameter vector (can be null, meaning all 0) * @param priorVar * The prior variance of the parameter vector (can be null, meaning all prior * variances = priorVar_rest) * @param defaultPriorVar * The prior variance for the parameters not specified in priorVar. * @param option * A comma-separated list of options * @throws Exception */ public void train(LibLinearDataset dataset, Map<String, Double> initParam, Map<String, Double> priorMean, Map<String, Double> priorVar, double defaultPriorVar, String option) throws Exception { train(dataset, initParam, priorMean, priorVar, 0.0, defaultPriorVar, option); } public void train(LibLinearDataset dataset, Map<String, Double> initParam, Map<String, Double> priorMean, Map<String, Double> priorVar, double defaultPriorMean, double defaultPriorVar, String option) throws Exception { train(dataset, initParam, priorMean, priorVar, defaultPriorMean, defaultPriorVar, option, false); } public void train(LibLinearDataset dataset, Map<String, Double> initParam, Map<String, Double> priorMean, Map<String, Double> priorVar, double defaultPriorMean, double defaultPriorVar, String option, boolean computePosteriorVar) throws Exception { if (!dataset.isFinished()) throw new IOException("Cannot train a model using unfinished dataset"); bias = dataset.bias; parseOption(option); // setup initial parameter vector param = new double[dataset.nFeatures()]; initSetup(param, initParam, dataset, 0.0); // setup prior mean this.priorMean = new double[dataset.nFeatures()]; initSetup(this.priorMean, priorMean, dataset, defaultPriorMean); // setup prior var this.priorVar = new double[dataset.nFeatures()]; initSetup(this.priorVar, priorVar, dataset, defaultPriorVar); // setup initial posterior variance postVar = null; postVarMap = null; postVarMatrix = null; postVarMatrixMap = null; if (computePosteriorVar) { // initialize the diagonal posterior variance postVar = new double[this.priorVar.length]; for (int i = 0; i < postVar.length; i++) postVar[i] = this.priorVar[i]; if (computeFullPostVar) { // initialize the full posterior variance matrix postVarMatrix = new double[postVar.length][]; for (int i = 0; i < postVarMatrix.length; i++) { postVarMatrix[i] = new double[postVarMatrix.length]; for (int j = 0; j < postVarMatrix.length; j++) { if (i == j) postVarMatrix[i][j] = this.priorVar[i]; else postVarMatrix[i][j] = 0; } } } } int pos = 0; for (int i = 0; i < dataset.nInstances(); i++) if (dataset.y[i] == 1) pos++; int neg = dataset.nInstances() - pos; if (type.equals(Logistic_L2_primal)) { double multiplier = 1; LibLinearFunction func; if (dataset instanceof LibLinearBinaryDataset) { func = new LogisticRegressionL2BinaryFeature((LibLinearBinaryDataset) dataset, this.priorMean, this.priorVar, multiplier, positive_weight, 1); } else { func = new LogisticRegressionL2(dataset, this.priorMean, this.priorVar, multiplier, positive_weight, 1); } if (reporter != null) { reporter.setStatus("Start LibLinear of type " + type); func.setReporter(reporter, reportFrequency); } // Find the posterior mode Tron tron = new Tron(func, epsilon * Math.min(pos, neg) / dataset.nInstances(), max_iter); tron.tron(param); // Compute the posterior variance if (computePosteriorVar) { if (computeFullPostVar) { // Compute the full posterior variance matrix func.hessian(param, postVarMatrix); RealMatrix H = MatrixUtils.createRealMatrix(postVarMatrix); CholeskyDecomposition decomp = new CholeskyDecomposition(H); DecompositionSolver solver = decomp.getSolver(); RealMatrix Var = solver.getInverse(); postVarMatrix = Var.getData(); for (int i = 0; i < postVar.length; i++) postVar[i] = postVarMatrix[i][i]; } else { // Compute the diagonal elements of the variance func.hessianDiagonal(param, postVar); for (int i = 0; i < postVar.length; i++) postVar[i] = 1.0 / postVar[i]; } } } else if (type.equals(Do_nothing)) { // do nothing } else throw new Exception("Unknown type: " + type); coeff = new HashMap<String, Double>(); if (computePosteriorVar) postVarMap = new HashMap<String, Double>(); for (int index = 1; index <= dataset.nFeatures(); index++) { String featureName = dataset.getFeatureName(index); coeff.put(featureName, param[index - 1]); if (computePosteriorVar) postVarMap.put(featureName, postVar[index - 1]); } if (computePosteriorVar && computeFullPostVar) { postVarMatrixMap = new HashMap<List<String>, Double>(); for (int i = 1; i <= dataset.nFeatures(); i++) { String name_i = dataset.getFeatureName(i); for (int j = 1; j <= dataset.nFeatures(); j++) { double cov = postVarMatrix[i - 1][j - 1]; if (cov != 0) { String name_j = dataset.getFeatureName(j); ArrayList<String> pair = new ArrayList<String>(2); pair.add(name_i); pair.add(name_j); postVarMatrixMap.put(pair, cov); } } } } // check for features with non-zero prior that do not appear in the dataset if (priorMean != null) { for (String key : priorMean.keySet()) { if (!coeff.containsKey(key)) { coeff.put(key, priorMean.get(key)); } } } if (priorVar != null && computePosteriorVar) { for (String key : priorVar.keySet()) { if (!postVarMap.containsKey(key)) postVarMap.put(key, priorVar.get(key)); if (computeFullPostVar) { ArrayList<String> pair = new ArrayList<String>(2); pair.add(key); pair.add(key); if (!postVarMatrixMap.containsKey(pair)) postVarMatrixMap.put(pair, priorVar.get(key)); } } } } /** * Same as train(dataset, null, null, null, priorVar, option) * * @param dataset * @param priorVar * @param option * @throws Exception */ public void train(LibLinearDataset dataset, double priorVar, String option) throws Exception { train(dataset, null, null, null, priorVar, option); } /** * Get the regression coefficients as a map * @return a map where the key is a feature name and the value is the * posterior mean of the corresponding regression coefficient */ public Map<String, Double> getParamMap() { return coeff; } /** * Get the posterior variances of regression coefficients as a map. * This method only returns the diagonal elements of the * variance-covariance matrix. * It returns null if computePosteriorVar==false when calling the * train method. * @return a map where the key is a feature name and the value is the * posterior variance of the corresponding regression coefficient */ public Map<String, Double> getPostVarMap() { return postVarMap; } /** * Get the full posterior variance-covariance matrix of regression coefficients * as a map. * It returns null if computePosteriorVar==false when calling the * train method or computeFullPostVar==false. * <b>Nonexistence of a pair of features means zero posterior correlation * between the corresponding two regression coefficients.</b> * @return a map where the key is a pair of feature names and the value is the * posterior covariance of the corresponding two regression coefficients. * The pair of feature names can be the same; in this case, the value * is the posterior variance. */ public Map<List<String>, Double> getPostVarMatrixMap() { return postVarMatrixMap; } /** * Return the fitted linear model * * @return * @throws Exception */ public LinearModel getLinearModel() throws Exception { if (coeff == null) throw new Exception( "This model has not been built. Please call train() before calling getLinearModel()."); LinearModel model = null; if (bias > 0) { model = new LinearModel(LibLinearDataset.INTERCEPT_NAME, coeff); } else { model = new LinearModel(0.0, coeff); } return model; } void initSetup(double[] param, Map<String, Double> map, LibLinearDataset dataset, double defaultValue) { for (int i = 0; i < param.length; i++) param[i] = defaultValue; if (map != null) { Iterator<Entry<String, Double>> iter = map.entrySet().iterator(); while (iter.hasNext()) { Entry<String, Double> entry = iter.next(); if (entry.getKey() == null) throw new RuntimeException("input key is null"); int index = dataset.getFeatureIndex(entry.getKey()); if (index == -1) continue; // the feature name does not appear in the dataset param[index - 1] = entry.getValue(); // index starts from 1 } } } static final String RUN_TRAIN = "train"; static final String RUN_PREDICT = "predict"; static void cmd_line_error(String msg, String cmd) { System.err.println("\nERROR: " + msg); System.err.println("\n" + cmd); System.exit(0); } /** * Command-line tool * * <pre> * java -cp target/regression-0.1-uber.jar com.linkedin.lab.regression.LibLinear * </pre> * * @param args * @throws Exception */ public static void main(String[] args) throws Exception { String cmd = "Input parameters (separated by space): \n" + " run:<command> (required) train or predict\n" + " ftype:<file_type> (required) libsvm or json\n" + " data:<file_name> (required) Input data file of the specified type\n" + " out:<file_name> (required) Output file\n" + " bias:<bias> (optional) Set to 0 if you do not want to add an\n" + " bias/intercept term\n" + " Set to 1 if you want to add a feature with\n" + " value 1 to every instance\n" + " Default: 0\n" + " param:<file_name> (optional) for run:train, it specifies the prior mean\n" + " (required) for run:predict, it specifies the model\n" + " File format: <featureName>=<value> per line\n" + " priorVar:<var> (required) for run:train, <var> is the a number\n" + " (not used) for run:predict\n" + " init:<file_name> (optional) for run:train, it specifies the initial value\n" + " File format: <featureName>=<value> per line\n" + " posteriorVar:1/0 (optional) Whether to compute posterior variances\n" + " Default: 1\n" + " posteriorCov:1/0 (optional) Whether to compute posterior covariances\n" + " Default: 0\n" + " binaryFeature:1/0 (optional) Whether all of the input features are binary\n" + " useShort:1/0 (optional) Whether to use short to store feature indices\n" + " option:<options> (optional) Comma-separated list of options\n" + " No space is allowed in <options>\n" + " Eg: max_iter=5,epsilon=0.01,positive_weight=2\n" + " (not used) for run:predict\n"; if (args.length < 3) { System.out.println("\n" + cmd); System.exit(0); } // Read the input parameters String run = null; String ftype = null; File dataFile = null; File outFile = null; double bias = 0; File paramFile = null; File initFile = null; double priorVar = Double.NaN; String option = null; boolean binaryFeature = false; boolean useShort = false; boolean computePostVar = true; boolean computePostCov = false; for (int i = 0; i < args.length; i++) { if (args[i] == null) continue; String[] token = args[i].split(":"); if (token.length < 2) cmd_line_error("'" + args[i] + "' is not a valid input parameter string!", cmd); for (int k = 2; k < token.length; k++) token[1] += ":" + token[k]; if (token[0].equals("run")) { run = token[1]; } else if (token[0].equals("ftype")) { ftype = token[1]; } else if (token[0].equals("data")) { dataFile = new File(token[1]); } else if (token[0].equals("out")) { outFile = new File(token[1]); } else if (token[0].equals("bias")) { bias = Double.parseDouble(token[1]); } else if (token[0].equals("param")) { paramFile = new File(token[1]); } else if (token[0].equals("init")) { initFile = new File(token[1]); } else if (token[0].equals("priorVar")) { priorVar = Double.parseDouble(token[1]); } else if (token[0].equals("option")) { option = token[1]; } else if (token[0].equals("binaryFeature")) { binaryFeature = Util.atob(token[1]); } else if (token[0].equals("useShort")) { useShort = Util.atob(token[1]); } else if (token[0].equals("posteriorVar")) { computePostVar = Util.atob(token[1]); } else if (token[0].equals("posteriorCov")) { computePostCov = Util.atob(token[1]); } else cmd_line_error("'" + args[i] + "' is not a valid input parameter string!", cmd); } if (run == null) cmd_line_error("Please specify run:<command>", cmd); if (ftype == null) cmd_line_error("Please specify ftype:<file_type>", cmd); if (dataFile == null) cmd_line_error("Please specify data:<file_name>", cmd); if (outFile == null) cmd_line_error("Please specify out:<file_name>", cmd); if (run.equals(RUN_TRAIN)) { Map<String, Double> priorMean = null; Map<String, Double> initParam = null; if (paramFile != null) { if (!paramFile.exists()) cmd_line_error("Param File '" + paramFile.getPath() + "' does not exist", cmd); priorMean = Util.readStringDoubleMap(paramFile, "="); } if (initFile != null) { if (!initFile.exists()) cmd_line_error("Init File '" + initFile.getPath() + "' does not exist", cmd); initParam = Util.readStringDoubleMap(initFile, "="); } if (priorVar == Double.NaN) cmd_line_error("Please specify priorVar:<var>", cmd); if (!dataFile.exists()) cmd_line_error("Data File '" + dataFile.getPath() + "' does not exist", cmd); LibLinearDataset dataset; if (binaryFeature) { dataset = new LibLinearBinaryDataset(bias, useShort); } else { dataset = new LibLinearDataset(bias); } if ("libsvm".equals(ftype)) { dataset.readFromLibSVM(dataFile); } //else if ("json".equals(ftype)) //{ // dataset.readFromJSON(dataFile); //} else cmd_line_error("Unknown file type 'ftype:" + ftype + "'", cmd); if (computePostCov == true && computePostVar == false) cmd_line_error("Cannot compute posterior covariances with posteriorVar:0", cmd); LibLinear liblinear = new LibLinear(); liblinear.setComputeFullPostVar(computePostCov); liblinear.train(dataset, initParam, priorMean, null, 0.0, priorVar, option, computePostVar); PrintStream out = new PrintStream(outFile); Util.printStringDoubleMap(out, liblinear.getParamMap(), "=", true); out.close(); if (computePostVar) { out = new PrintStream(outFile + ".var"); Util.printStringDoubleMap(out, liblinear.getPostVarMap(), "=", true); out.close(); if (computePostCov) { out = new PrintStream(outFile + ".cov"); Util.printStringListDoubleMap(out, liblinear.getPostVarMatrixMap(), "="); out.close(); } } } else if (run.equals(RUN_PREDICT)) { throw new Exception("run:predict is not supported yet :("); } else cmd_line_error("Unknown run:" + run, cmd); } }