Java tutorial: AutoCoder, a Hadoop MapReduce example that pre-trains a stacked autoencoder layer by layer with RBM-style contrastive-divergence updates
/*
 * Cloud9: A Hadoop toolkit for working with big data
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package layer;
//package edu.umd.cloud9.example.bigram;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Random;
import java.util.StringTokenizer;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import edu.umd.cloud9.io.array.ArrayListOfFloatsWritable;

public class AutoCoder extends Configured implements Tool {
  private static final Logger LOG = Logger.getLogger(AutoCoder.class);

  protected static class MyMapper extends Mapper<LongWritable, Text, Text, ModelNode> {
    private static final Text comp = new Text();
    private static final ModelNode model = new ModelNode();
    private static int num_train_data = 0;
    private static final Random rd = new Random();

    private static float[][] sample_mem = new float[GlobalUtil.NUM_LAYER + 1][]; // space storing the MCMC samples
    private static float[][] weights = new float[GlobalUtil.NUM_LAYER + 1][];    // space storing the updating weights (first is not used)
    private static float[][] bh = new float[GlobalUtil.NUM_LAYER + 1][];         // hidden layer biases (rbm)
    private static float[][] bv = new float[GlobalUtil.NUM_LAYER + 1][];         // visible layer biases (rbm)

    private static int NUM_LAYER = GlobalUtil.NUM_LAYER;
    private static int NODES_INPUT = GlobalUtil.NODES_INPUT;
    private static final int[] train_len = GlobalUtil.train_len;
    private static final int[] test_len = GlobalUtil.test_len;
    private static final int[] nodes_layer = GlobalUtil.nodes_layer;

    private static float
        yita_w = GlobalUtil.yita_w, yita_bv = GlobalUtil.yita_bv, yita_bh = GlobalUtil.yita_bh,
        yita_wt = GlobalUtil.yita_wt, yita_bvt = GlobalUtil.yita_bvt, yita_bht = GlobalUtil.yita_bht; // learning rates
    private static float mu = GlobalUtil.mu, reg = GlobalUtil.reg;

    private static int layer_ind = 0;

    // Reads the next non-empty line and parses it as a float; returns 0 when the stream is exhausted.
    private static float read_float(BufferedReader reader) throws NumberFormatException, IOException {
      while (reader.ready()) {
        String line = reader.readLine();
        if (line.length() == 0)
          continue;
        return Float.parseFloat(line);
      }
      return 0;
    }

    public void setup(Context context) throws IOException {
      // load the information of k clusters
      layer_ind = context.getConfiguration().getInt("layer_ind", 0);
      String file = context.getConfiguration().get("sidepath");
      FSDataInputStream cluster = FileSystem.get(context.getConfiguration()).open(new Path(file));
      BufferedReader reader = new BufferedReader(new InputStreamReader(cluster));

      // Initialize the memory for MCMC samples
      for (int k = 0; k < GlobalUtil.NUM_LAYER + 1; k++) {
        sample_mem[k] = new float[GlobalUtil.nodes_layer[k]];
      }

      // Initialize the memory for weight parameters
      for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++) {
        weights[k] = new float[GlobalUtil.nodes_layer[k - 1] * GlobalUtil.nodes_layer[k]];
        bv[k] = new float[GlobalUtil.nodes_layer[k - 1]];
        bh[k] = new float[GlobalUtil.nodes_layer[k]];
      }

      /*
      for (int k = 0; k < GlobalUtil.NUM_LAYER + 1; k++)
        for (int j = 0; j < GlobalUtil.nodes_layer[k]; j++)
          sample_mem[k][j] = read_float(reader);
      */

      for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++)
        for (int j = 0; j < GlobalUtil.nodes_layer[k - 1] * GlobalUtil.nodes_layer[k]; j++)
          weights[k][j] = read_float(reader);

      for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++)
        for (int j = 0; j < GlobalUtil.nodes_layer[k - 1]; j++)
          bv[k][j] = read_float(reader);

      for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++)
        for (int j = 0; j < GlobalUtil.nodes_layer[k]; j++)
          bh[k][j] = read_float(reader);

      reader.close();
      cluster.close();

      num_train_data = 0;
    }

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
      // Each input line is one training example: whitespace-separated float features.
      String line = value.toString();
      StringTokenizer itr = new StringTokenizer(line);

      float[] data = new float[NODES_INPUT];
      int tot = 0;
      while (itr.hasMoreTokens()) {
        String curr = itr.nextToken();
        data[tot] = Float.parseFloat(curr);
        tot++;
      }

      for (int i = 0; i < nodes_layer[layer_ind - 1]; i++) {
        sample_mem[layer_ind - 1][i] = data[i];
      }
      num_train_data++;

      work_update();
    }

    public void cleanup(Context context) throws IOException, InterruptedException {
      // Emit this mapper's locally updated parameters, keyed by the number of examples it saw.
      comp.set(String.valueOf(num_train_data));

      ArrayListOfFloatsWritable[] W = new ArrayListOfFloatsWritable[GlobalUtil.NUM_LAYER + 1];
      ArrayListOfFloatsWritable[] BV = new ArrayListOfFloatsWritable[GlobalUtil.NUM_LAYER + 1];
      ArrayListOfFloatsWritable[] BH = new ArrayListOfFloatsWritable[GlobalUtil.NUM_LAYER + 1];
      for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++) {
        W[k] = new ArrayListOfFloatsWritable(weights[k]);
        BV[k] = new ArrayListOfFloatsWritable(bv[k]);
        BH[k] = new ArrayListOfFloatsWritable(bh[k]);
      }
      model.setWeight(W);
      model.setBV(BV);
      model.setBH(BH);

      context.write(comp, model);
    }

    // Performs one contrastive-divergence (CD-1) style update of the current layer
    // using the example most recently stored in sample_mem.
    void work_update() {
      float[] x0 = new float[nodes_layer[layer_ind - 1]]; // data
      float[] h0 = new float[nodes_layer[layer_ind]];     // hidden
      float[] x1 = new float[nodes_layer[layer_ind - 1]];
      float[] h1 = new float[nodes_layer[layer_ind]];

      float[] inc_w = new float[nodes_layer[layer_ind - 1] * nodes_layer[layer_ind]]; // previous increase of weights
      float[] inc_bv = new float[nodes_layer[layer_ind - 1]];
      float[] inc_bh = new float[nodes_layer[layer_ind]];
      Arrays.fill(inc_w, 0);
      Arrays.fill(inc_bv, 0);
      Arrays.fill(inc_bh, 0);

      for (int i = 0; i < nodes_layer[layer_ind - 1]; i++)
        x0[i] = sample_mem[layer_ind - 1][i];

      if (layer_ind != NUM_LAYER) { // normal layer
        // perform real computation
        GlobalUtil.sigm(h0, bh[layer_ind], weights[layer_ind], x0,
            nodes_layer[layer_ind], nodes_layer[layer_ind - 1], true); // up sampling

        for (int j = 0; j < nodes_layer[layer_ind]; j++)
          sample_mem[layer_ind][j] = h0[j];

        // binarize the hidden activations
        for (int i = 0; i < nodes_layer[layer_ind]; i++) {
          if (rd.nextFloat() < h0[i])
            h0[i] = 1;
          else
            h0[i] = 0;
        }

        GlobalUtil.sigm(x1, bv[layer_ind], weights[layer_ind], h0,
            nodes_layer[layer_ind], nodes_layer[layer_ind - 1], false); // down sampling
        GlobalUtil.sigm(h1, bh[layer_ind], weights[layer_ind], x1,
            nodes_layer[layer_ind], nodes_layer[layer_ind - 1], true);

        // gradient step with momentum (mu) and weight decay (reg)
        for (int j = 0; j < nodes_layer[layer_ind]; j++)
          for (int i = 0; i < nodes_layer[layer_ind - 1]; i++) {
            inc_w[j * nodes_layer[layer_ind - 1] + i] =
                mu * inc_w[j * nodes_layer[layer_ind - 1] + i]
                + yita_w * (h0[j] * x0[i] - h1[j] * x1[i]
                    - reg * weights[layer_ind][j * nodes_layer[layer_ind - 1] + i]);
            weights[layer_ind][j * nodes_layer[layer_ind - 1] + i] =
                weights[layer_ind][j * nodes_layer[layer_ind - 1] + i]
                + inc_w[j * nodes_layer[layer_ind - 1] + i];
          }

        for (int j = 0; j < nodes_layer[layer_ind]; j++) {
          inc_bh[j] = mu * inc_bh[j] + yita_bh * (h0[j] - h1[j] - reg * bh[layer_ind][j]);
          bh[layer_ind][j] = bh[layer_ind][j] + inc_bh[j];
        }

        for (int i = 0; i < nodes_layer[layer_ind - 1]; i++) {
          inc_bv[i] = mu * inc_bv[i] + yita_bv * (x0[i] - x1[i] - reg * bv[layer_ind][i]);
          bv[layer_ind][i] = bv[layer_ind][i] + inc_bv[i];
        }
        // print the layer input data (just for testing)
      } else { // top layer: hidden units are linear, not sigmoid
        // perform real computation
        for (int j = 0; j < nodes_layer[NUM_LAYER]; j++) {
          h0[j] = bh[NUM_LAYER][j];
          for (int i = 0; i < nodes_layer[NUM_LAYER - 1]; i++)
            h0[j] = h0[j] + weights[NUM_LAYER][j * nodes_layer[NUM_LAYER - 1] + i] * x0[i];
        }

        for (int j = 0; j < nodes_layer[layer_ind]; j++)
          sample_mem[layer_ind][j] = h0[j];

        GlobalUtil.sigm(x1, bv[layer_ind], weights[NUM_LAYER], h0,
            nodes_layer[layer_ind], nodes_layer[layer_ind - 1], false); // down sampling

        for (int j = 0; j < nodes_layer[NUM_LAYER]; j++) {
          h1[j] = bh[NUM_LAYER][j];
          for (int i = 0; i < nodes_layer[NUM_LAYER - 1]; i++)
            h1[j] = h1[j] + weights[NUM_LAYER][j * nodes_layer[NUM_LAYER - 1] + i] * x1[i];
        }

        for (int j = 0; j < nodes_layer[layer_ind]; j++)
          for (int i = 0; i < nodes_layer[layer_ind - 1]; i++) {
            inc_w[j * nodes_layer[layer_ind - 1] + i] =
                mu * inc_w[j * nodes_layer[layer_ind - 1] + i]
                + yita_wt * (h0[j] * x0[i] - h1[j] * x1[i]
                    - reg * weights[layer_ind][j * nodes_layer[layer_ind - 1] + i]);
            weights[layer_ind][j * nodes_layer[layer_ind - 1] + i] =
                weights[layer_ind][j * nodes_layer[layer_ind - 1] + i]
                + inc_w[j * nodes_layer[layer_ind - 1] + i];
          }

        for (int j = 0; j < nodes_layer[layer_ind]; j++) {
          inc_bh[j] = mu * inc_bh[j] + yita_bht * (h0[j] - h1[j] - reg * bh[layer_ind][j]);
          bh[layer_ind][j] = bh[layer_ind][j] + inc_bh[j];
        }

        for (int i = 0; i < nodes_layer[layer_ind - 1]; i++) {
          inc_bv[i] = mu * inc_bv[i] + yita_bvt * (x0[i] - x1[i] - reg * bv[layer_ind][i]);
          bv[layer_ind][i] = bv[layer_ind][i] + inc_bv[i];
        }
        // print the layer input data (just for testing)
      }
    }
  }

  protected static class MyReducer extends Reducer<Text, ModelNode, Text, ModelNode> {
    private static final Text result = new Text();
    private static ModelNode model = new ModelNode(); // not final: reduce() may replace it via combine()
    private static final Random rd = new Random();
    private static float[][] weights = new float[GlobalUtil.NUM_LAYER + 1][]; // space storing the updating weights (first is not used)
    private static float[][] bh = new float[GlobalUtil.NUM_LAYER + 1][];      // hidden layer biases (rbm)
    private static float[][] bv = new float[GlobalUtil.NUM_LAYER + 1][];      // visible layer biases (rbm)

    private static int NUM_LAYER = GlobalUtil.NUM_LAYER;
    private static int NODES_INPUT = GlobalUtil.NODES_INPUT;
    private static final int[] train_len = GlobalUtil.train_len;
    private static final int[] test_len = GlobalUtil.test_len;
    private static final int[] nodes_layer = GlobalUtil.nodes_layer;

    private static float
        yita_w = GlobalUtil.yita_w, yita_bv = GlobalUtil.yita_bv, yita_bh = GlobalUtil.yita_bh,
        yita_wt = GlobalUtil.yita_wt, yita_bvt = GlobalUtil.yita_bvt, yita_bht = GlobalUtil.yita_bht; // learning rates
    private static float mu = GlobalUtil.mu, reg = GlobalUtil.reg;

    private static int layer_ind = 0;
    private static int count = 0;

    public void setup(Context context) throws IOException {
      // load the information of k clusters
      layer_ind = context.getConfiguration().getInt("layer_ind", 0);

      // Initialize the memory for weight parameters
      for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++) {
        weights[k] = new float[GlobalUtil.nodes_layer[k - 1] * GlobalUtil.nodes_layer[k]];
        bv[k] = new float[GlobalUtil.nodes_layer[k - 1]];
        bh[k] = new float[GlobalUtil.nodes_layer[k]];
      }
      count = 0;
    }

    public void cleanup(Context context) throws IOException, InterruptedException {
      result.set("result");
      context.write(result, model);
    }

    @Override
    public void reduce(Text key, Iterable<ModelNode> values, Context context)
        throws IOException, InterruptedException {
      Iterator<ModelNode> iter = values.iterator();
      while (iter.hasNext()) {
        ModelNode now = iter.next();
        combine(now);
      }
    }

    // Keeps the first model seen. (The original "combine(ModelNode model, ModelNode now)"
    // only reassigned a local parameter and had no effect; assigning to the static field
    // fixes that. Note that Hadoop reuses the Writable instance handed out by the values
    // iterator, so a deep copy may be needed in practice.)
    void combine(ModelNode now) {
      if (count == 0) {
        model = now;
      }
      count++;
    }
  }

  // The map output value is ModelNode, so the partitioner is typed accordingly.
  protected static class MyPartitioner extends Partitioner<Text, ModelNode> {
    @Override
    public int getPartition(Text key, ModelNode value, int numReduceTasks) {
      return 0; // every key goes to the first reducer
    }
  }

  public AutoCoder() {}

  private static final String INPUT = "input";
  private static final String OUTPUT = "output";
  private static final String NUM_REDUCERS = "numReducers";

  private static int printUsage() {
    System.out.println("usage: [input-path] [output-path] [num-reducers]");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  /**
   * Runs this tool.
   */
  @SuppressWarnings({ "static-access" })
  public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
        .withDescription("number of reducers").create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
      System.out.println("args: " + Arrays.toString(args));
      HelpFormatter formatter = new HelpFormatter();
      formatter.setWidth(120);
      formatter.printHelp(this.getClass().getName(), options);
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    String inputPath0 = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS)
        ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;

    LOG.info("Tool: " + AutoCoder.class.getSimpleName());
    LOG.info(" - input path: " + inputPath0);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Configuration conf = getConf();
    initialParameters(conf);

    for (int iterations = 1; iterations < GlobalUtil.NUM_LAYER + 1; iterations++) {
      LOG.info("** Layer: " + iterations);
      try {
        Job job = Job.getInstance(conf);
        job.setJobName(AutoCoder.class.getSimpleName());
        job.setJarByClass(AutoCoder.class);

        // set the path of the information of k clusters in this iteration
        job.getConfiguration().set("sidepath", inputPath0 + "/side_output");
        job.getConfiguration().setInt("layer_ind", iterations);
        job.setNumReduceTasks(reduceTasks);

        String inputPath = inputPath0 + "/train";
        dataShuffle();

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ModelNode.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ModelNode.class);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setPartitionerClass(MyPartitioner.class);

        // Delete the output directory if it exists already.
        Path outputDir = new Path(outputPath);
        FileSystem.get(getConf()).delete(outputDir, true);

        long startTime = System.currentTimeMillis();
        job.waitForCompletion(true);
        LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

        prepareNextIteration(inputPath0, outputPath, iterations, conf, reduceTasks);
      } catch (Exception exp) {
        exp.printStackTrace();
      }
    }
    return 0;
  }

  /**
   * Dispatches command-line arguments to the tool via the {@code ToolRunner}.
   */
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new AutoCoder(), args);
  }

  public static void initialParameters(Configuration conf) {
  }

  public static void dataShuffle() {
  }

  public static void prepareNextIteration(String input, String output, int iterations,
      Configuration conf, int reduceTasks) {
    String dstName = input + "/cluster" + iterations;
    try {
      FileSystem fs = FileSystem.get(conf);
      fs.delete(new Path(dstName), true);
      FSDataOutputStream clusterfile = fs.create(new Path(dstName));

      for (int i = 0; i < reduceTasks; i++) {
        String srcName = output + "/part-r-" + String.format("%05d", i);
        FSDataInputStream cluster = fs.open(new Path(srcName));
        BufferedReader reader = new BufferedReader(new InputStreamReader(cluster));
        while (reader.ready()) {
          String line = reader.readLine() + "\n";
          if (line.length() > 5)
            clusterfile.write(line.getBytes());
        }
        reader.close();
        cluster.close();
      }

      clusterfile.flush();
      clusterfile.close();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
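Note: the classes GlobalUtil and ModelNode referenced above belong to the same layer package but are not shown here. The sketch below is a hypothetical reconstruction of GlobalUtil.sigm(...), inferred only from how the mapper calls it (output buffer, bias vector, row-major weight matrix, input buffer, hidden size, visible size, up/down flag); the real helper may differ, and the constants it would also hold (NUM_LAYER, nodes_layer, the learning rates, and so on) are omitted.

public class GlobalUtil {
  // Computes sigmoid activations for one RBM layer.
  //   up == true : visible -> hidden,  out[j] = sigmoid(bias[j] + sum_i w[j*nv + i] * in[i])
  //   up == false: hidden  -> visible, out[i] = sigmoid(bias[i] + sum_j w[j*nv + i] * in[j])
  public static void sigm(float[] out, float[] bias, float[] w, float[] in,
      int nh, int nv, boolean up) {
    if (up) {
      for (int j = 0; j < nh; j++) {
        float s = bias[j];
        for (int i = 0; i < nv; i++)
          s += w[j * nv + i] * in[i];
        out[j] = (float) (1.0 / (1.0 + Math.exp(-s)));
      }
    } else {
      for (int i = 0; i < nv; i++) {
        float s = bias[i];
        for (int j = 0; j < nh; j++)
          s += w[j * nv + i] * in[j];
        out[i] = (float) (1.0 / (1.0 + Math.exp(-s)));
      }
    }
  }
}

Assuming the compiled classes are packaged into a jar (the jar name and HDFS paths below are only placeholders), the tool would be launched through ToolRunner with the options defined in run(); the input directory is expected to contain a train/ subdirectory and a side_output file holding the initial parameters:

  hadoop jar autocoder.jar layer.AutoCoder -input /data/autocoder -output /data/autocoder/out -numReducers 1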