Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.cloudera.knittingboar.sgd.iterativereduce; import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.hadoop.util.ToolRunner; import org.apache.mahout.classifier.sgd.L1; import org.apache.mahout.classifier.sgd.UniformPrior; import org.apache.mahout.math.DenseVector; import org.apache.mahout.math.RandomAccessSparseVector; import org.apache.mahout.math.Vector; import com.cloudera.knittingboar.messages.iterativereduce.ParameterVectorGradient; import com.cloudera.knittingboar.messages.iterativereduce.ParameterVectorGradientUpdatable; import com.cloudera.knittingboar.metrics.POLRMetrics; import com.cloudera.knittingboar.records.CSVBasedDatasetRecordFactory; import com.cloudera.knittingboar.records.RCV1RecordFactory; import com.cloudera.knittingboar.records.RecordFactory; import com.cloudera.knittingboar.records.TwentyNewsgroupsRecordFactory; import com.cloudera.knittingboar.sgd.GradientBuffer; import com.cloudera.knittingboar.sgd.POLRModelParameters; import com.cloudera.knittingboar.sgd.ParallelOnlineLogisticRegression; //import com.cloudera.knittingboar.yarn.CompoundAdditionWorker; import com.cloudera.iterativereduce.ComputableWorker; import com.cloudera.iterativereduce.yarn.appworker.ApplicationWorker; import com.cloudera.iterativereduce.io.RecordParser; import com.cloudera.iterativereduce.io.TextRecordParser; import com.google.common.collect.Lists; /** * The Worker node for IterativeReduce - performs work on the shard of input * data for the parallel iterative algorithm - runs the SGD algorithm locally on * its shard of data * * @author jpatterson * */ public class POLRWorkerNode extends POLRNodeBase implements ComputableWorker<ParameterVectorGradientUpdatable> { private static final Log LOG = LogFactory.getLog(POLRWorkerNode.class); int masterTotal = 0; public ParallelOnlineLogisticRegression polr = null; // lmp.createRegression(); public POLRModelParameters polr_modelparams; public String internalID = "0"; private RecordFactory VectorFactory = null; private TextRecordParser lineParser = null; private boolean IterationComplete = false; private int CurrentIteration = 0; // basic stats tracking POLRMetrics metrics = new POLRMetrics(); double averageLineCount = 0.0; int k = 0; double step = 0.0; int[] bumps = new int[] { 1, 2, 5 }; double lineCount = 0; /** * Sends a full copy of the multinomial logistic regression array of parameter * vectors to the master - this method plugs the local parameter vector into * the message */ public ParameterVectorGradient GenerateUpdate() { ParameterVectorGradient gradient = new ParameterVectorGradient(); gradient.parameter_vector = this.polr.getBeta().clone(); // this.polr.getGamma().getMatrix().clone(); gradient.SrcWorkerPassCount = this.LocalBatchCountForIteration; if (this.lineParser.hasMoreRecords()) { gradient.IterationComplete = 0; } else { gradient.IterationComplete = 1; } gradient.CurrentIteration = this.CurrentIteration; gradient.AvgLogLikelihood = (new Double(metrics.AvgLogLikelihood)).floatValue(); gradient.PercentCorrect = (new Double(metrics.AvgCorrect * 100)).floatValue(); gradient.TrainedRecords = (new Long(metrics.TotalRecordsProcessed)).intValue(); return gradient; } /** * The IR::Compute method - this is where we do the next batch of records for * SGD */ @Override public ParameterVectorGradientUpdatable compute() { Text value = new Text(); long batch_vec_factory_time = 0; boolean result = true; //boolean processBatch = false; /* if (this.LocalPassCount > this.GlobalPassCount) { // we need to sit this one out System.out.println("Worker " + this.internalID + " is ahead of global pass count [" + this.LocalPassCount + ":" + this.GlobalPassCount + "] "); processBatch = true; } if (this.LocalPassCount >= this.NumberPasses) { // learning is done, terminate System.out.println("Worker " + this.internalID + " is done [" + this.LocalPassCount + ":" + this.GlobalPassCount + "] "); processBatch = false; } if (processBatch) { */ // if (this.lineParser.hasMoreRecords()) { //for (int x = 0; x < this.BatchSize; x++) { while (this.lineParser.hasMoreRecords()) { try { result = this.lineParser.next(value); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } if (result) { long startTime = System.currentTimeMillis(); Vector v = new RandomAccessSparseVector(this.FeatureVectorSize); int actual = -1; try { actual = this.VectorFactory.processLine(value.toString(), v); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } long endTime = System.currentTimeMillis(); batch_vec_factory_time += (endTime - startTime); // calc stats --------- double mu = Math.min(k + 1, 200); double ll = this.polr.logLikelihood(actual, v); metrics.AvgLogLikelihood = metrics.AvgLogLikelihood + (ll - metrics.AvgLogLikelihood) / mu; if (Double.isNaN(metrics.AvgLogLikelihood)) { metrics.AvgLogLikelihood = 0; } Vector p = new DenseVector(this.num_categories); this.polr.classifyFull(p, v); int estimated = p.maxValueIndex(); int correct = (estimated == actual ? 1 : 0); metrics.AvgCorrect = metrics.AvgCorrect + (correct - metrics.AvgCorrect) / mu; this.polr.train(actual, v); k++; metrics.TotalRecordsProcessed = k; // if (x == this.BatchSize - 1) { /* System.err .printf( "Worker %s:\t Iteration: %s, Trained Recs: %10d, AvgLL: %10.3f, Percent Correct: %10.2f, VF: %d\n", this.internalID, this.CurrentIteration, k, metrics.AvgLogLikelihood, metrics.AvgCorrect * 100, batch_vec_factory_time); */ // } this.polr.close(); } else { // this.LocalBatchCountForIteration++; // this.input_split.ResetToStartOfSplit(); // nothing else to process in split! // break; } // if } // for the batch size System.err.printf( "Worker %s:\t Iteration: %s, Trained Recs: %10d, AvgLL: %10.3f, Percent Correct: %10.2f, VF: %d\n", this.internalID, this.CurrentIteration, k, metrics.AvgLogLikelihood, metrics.AvgCorrect * 100, batch_vec_factory_time); /* } else { System.err .printf( "Worker %s:\t Trained Recs: %10d, AvgLL: %10.3f, Percent Correct: %10.2f, [Done With Iteration]\n", this.internalID, k, metrics.AvgLogLikelihood, metrics.AvgCorrect * 100); } // if */ return new ParameterVectorGradientUpdatable(this.GenerateUpdate()); } public ParameterVectorGradientUpdatable getResults() { return new ParameterVectorGradientUpdatable(GenerateUpdate()); } /** * This is called when we recieve an update from the master * * here we - replace the gradient vector with the new global gradient vector * */ @Override public void update(ParameterVectorGradientUpdatable t) { // masterTotal = t.get(); ParameterVectorGradient global_update = t.get(); // set the local parameter vector to the global aggregate ("beta") this.polr.SetBeta(global_update.parameter_vector); // update global count this.GlobalBatchCountForIteration = global_update.GlobalPassCount; // flush the local gradient delta buffer ("gamma") this.polr.FlushGamma(); /* if (global_update.IterationComplete == 0) { this.IterationComplete = false; } else { this.IterationComplete = true; // when this happens, it will trip the ApplicationWorkerService loop and iteration will increment } */ } @Override public void setup(Configuration c) { this.conf = c; try { this.num_categories = this.conf.getInt("com.cloudera.knittingboar.setup.numCategories", 2); // feature vector size this.FeatureVectorSize = LoadIntConfVarOrException("com.cloudera.knittingboar.setup.FeatureVectorSize", "Error loading config: could not load feature vector size"); // feature vector size this.BatchSize = this.conf.getInt("com.cloudera.knittingboar.setup.BatchSize", 200); // this.NumberPasses = this.conf.getInt( // "com.cloudera.knittingboar.setup.NumberPasses", 1); // app.iteration.count this.NumberPasses = this.conf.getInt("app.iteration.count", 1); // protected double Lambda = 1.0e-4; this.Lambda = Double.parseDouble(this.conf.get("com.cloudera.knittingboar.setup.Lambda", "1.0e-4")); // protected double LearningRate = 50; this.LearningRate = Double .parseDouble(this.conf.get("com.cloudera.knittingboar.setup.LearningRate", "10")); // maps to either CSV, 20newsgroups, or RCV1 this.RecordFactoryClassname = LoadStringConfVarOrException( "com.cloudera.knittingboar.setup.RecordFactoryClassname", "Error loading config: could not load RecordFactory classname"); if (this.RecordFactoryClassname.equals(RecordFactory.CSV_RECORDFACTORY)) { // so load the CSV specific stuff ---------- // predictor label names this.PredictorLabelNames = LoadStringConfVarOrException( "com.cloudera.knittingboar.setup.PredictorLabelNames", "Error loading config: could not load predictor label names"); // predictor var types this.PredictorVariableTypes = LoadStringConfVarOrException( "com.cloudera.knittingboar.setup.PredictorVariableTypes", "Error loading config: could not load predictor variable types"); // target variables this.TargetVariableName = LoadStringConfVarOrException( "com.cloudera.knittingboar.setup.TargetVariableName", "Error loading config: Target Variable Name"); // column header names this.ColumnHeaderNames = LoadStringConfVarOrException( "com.cloudera.knittingboar.setup.ColumnHeaderNames", "Error loading config: Column Header Names"); // System.out.println("LoadConfig(): " + this.ColumnHeaderNames); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } this.SetupPOLR(); } private void SetupPOLR() { // do splitting strings into arrays here... String[] predictor_label_names = this.PredictorLabelNames.split(","); String[] variable_types = this.PredictorVariableTypes.split(","); polr_modelparams = new POLRModelParameters(); polr_modelparams.setTargetVariable(this.TargetVariableName); polr_modelparams.setNumFeatures(this.FeatureVectorSize); polr_modelparams.setUseBias(true); List<String> typeList = Lists.newArrayList(); for (int x = 0; x < variable_types.length; x++) { typeList.add(variable_types[x]); } List<String> predictorList = Lists.newArrayList(); for (int x = 0; x < predictor_label_names.length; x++) { predictorList.add(predictor_label_names[x]); } // where do these come from? polr_modelparams.setTypeMap(predictorList, typeList); polr_modelparams.setLambda(this.Lambda); // based on defaults - match // command line polr_modelparams.setLearningRate(this.LearningRate); // based on defaults - // match command line // setup record factory stuff here --------- if (RecordFactory.TWENTYNEWSGROUPS_RECORDFACTORY.equals(this.RecordFactoryClassname)) { this.VectorFactory = new TwentyNewsgroupsRecordFactory("\t"); } else if (RecordFactory.RCV1_RECORDFACTORY.equals(this.RecordFactoryClassname)) { this.VectorFactory = new RCV1RecordFactory(); } else { // it defaults to the CSV record factor, but a custom one this.VectorFactory = new CSVBasedDatasetRecordFactory(this.TargetVariableName, polr_modelparams.getTypeMap()); ((CSVBasedDatasetRecordFactory) this.VectorFactory).firstLine(this.ColumnHeaderNames); } polr_modelparams.setTargetCategories(this.VectorFactory.getTargetCategories()); // ----- this normally is generated from the POLRModelParams ------ this.polr = new ParallelOnlineLogisticRegression(this.num_categories, this.FeatureVectorSize, new UniformPrior()).alpha(1).stepOffset(1000).decayExponent(0.9).lambda(this.Lambda) .learningRate(this.LearningRate); polr_modelparams.setPOLR(polr); // this.bSetup = true; } @Override public void setRecordParser(RecordParser r) { this.lineParser = (TextRecordParser) r; } /** * only implemented for completeness with the interface, we argued over how to * implement this. - this is currently a legacy artifact */ @Override public ParameterVectorGradientUpdatable compute(List<ParameterVectorGradientUpdatable> records) { // TODO Auto-generated method stub return compute(); } public static void main(String[] args) throws Exception { TextRecordParser parser = new TextRecordParser(); POLRWorkerNode pwn = new POLRWorkerNode(); ApplicationWorker<ParameterVectorGradientUpdatable> aw = new ApplicationWorker<ParameterVectorGradientUpdatable>( parser, pwn, ParameterVectorGradientUpdatable.class); ToolRunner.run(aw, args); } /* @Override public int getCurrentGlobalIteration() { // TODO Auto-generated method stub return 0; } */ /** * returns false if we're done with iterating over the data * * @return */ @Override public boolean IncrementIteration() { this.CurrentIteration++; this.IterationComplete = false; this.lineParser.reset(); System.out.println("IncIteration > " + this.CurrentIteration + ", " + this.NumberPasses); if (this.CurrentIteration >= this.NumberPasses) { System.out.println("POLRWorkerNode: [ done with all iterations ]"); return false; } return true; } /* @Override public boolean isStillWorkingOnCurrentIteration() { //return this.lineParser.hasMoreRecords(); //return this. return !this.IterationComplete; } */ }