Java tutorial: AutoCoder, a Hadoop MapReduce example that pre-trains a stacked autoencoder layer by layer with RBM-style contrastive-divergence updates
/*
 * Cloud9: A Hadoop toolkit for working with big data
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package layer;
//package edu.umd.cloud9.example.bigram;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Random;
import java.util.StringTokenizer;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import edu.umd.cloud9.io.array.ArrayListOfFloatsWritable;

public class AutoCoder extends Configured implements Tool {
  private static final Logger LOG = Logger.getLogger(AutoCoder.class);

  protected static class MyMapper extends Mapper<LongWritable, Text, Text, ModelNode> {
    private static final Text comp = new Text();
    private static final ModelNode model = new ModelNode();
    private static int num_train_data = 0;
    private static final Random rd = new Random();

    private static float[][] sample_mem = new float[GlobalUtil.NUM_LAYER + 1][]; // space storing the MCMC samples
    private static float[][] weights = new float[GlobalUtil.NUM_LAYER + 1][];    // space storing the updating weights (first is not used)
    private static float[][] bh = new float[GlobalUtil.NUM_LAYER + 1][];         // hidden layer biases (rbm)
    private static float[][] bv = new float[GlobalUtil.NUM_LAYER + 1][];         // visible layer biases (rbm)

    private static int NUM_LAYER = GlobalUtil.NUM_LAYER;
    private static int NODES_INPUT = GlobalUtil.NODES_INPUT;
    private static final int[] train_len = GlobalUtil.train_len;
    private static final int[] test_len = GlobalUtil.test_len;
    private static final int[] nodes_layer = GlobalUtil.nodes_layer;

    private static float
        yita_w = GlobalUtil.yita_w, yita_bv = GlobalUtil.yita_bv, yita_bh = GlobalUtil.yita_bh,
        yita_wt = GlobalUtil.yita_wt, yita_bvt = GlobalUtil.yita_bvt, yita_bht = GlobalUtil.yita_bht; // learning rates
    private static float mu = GlobalUtil.mu, reg = GlobalUtil.reg;

    private static int layer_ind = 0;

    // Reads the next non-empty line and parses it as a float; returns 0 when the stream is exhausted.
    private static float read_float(BufferedReader reader) throws NumberFormatException, IOException {
      while (reader.ready()) {
        String line = reader.readLine();
        if (line.length() == 0)
          continue;
        return Float.parseFloat(line);
      }
      return 0;
    }

    public void setup(Context context) throws IOException {
      // load the information of k clusters
      layer_ind = context.getConfiguration().getInt("layer_ind", 0);
      String file = context.getConfiguration().get("sidepath");
      FSDataInputStream cluster = FileSystem.get(context.getConfiguration()).open(new Path(file));
      BufferedReader reader = new BufferedReader(new InputStreamReader(cluster));

      // Initialize the memory for MCMC samples
      for (int k = 0; k < GlobalUtil.NUM_LAYER + 1; k++) {
        sample_mem[k] = new float[GlobalUtil.nodes_layer[k]];
      }

      // Initialize the memory for weight parameters
      for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++) {
        weights[k] = new float[GlobalUtil.nodes_layer[k - 1] * GlobalUtil.nodes_layer[k]];
        bv[k] = new float[GlobalUtil.nodes_layer[k - 1]];
        bh[k] = new float[GlobalUtil.nodes_layer[k]];
      }

      /*
      for (int k = 0; k < GlobalUtil.NUM_LAYER + 1; k++)
        for (int j = 0; j < GlobalUtil.nodes_layer[k]; j++)
          sample_mem[k][j] = read_float(reader);
      */

      for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++)
        for (int j = 0; j < GlobalUtil.nodes_layer[k - 1] * GlobalUtil.nodes_layer[k]; j++)
          weights[k][j] = read_float(reader);

      for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++)
        for (int j = 0; j < GlobalUtil.nodes_layer[k - 1]; j++)
          bv[k][j] = read_float(reader);

      for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++)
        for (int j = 0; j < GlobalUtil.nodes_layer[k]; j++)
          bh[k][j] = read_float(reader);

      reader.close();
      cluster.close();

      num_train_data = 0;
    }

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
      // Each input line is one training example: whitespace-separated float features.
      String line = value.toString();
      StringTokenizer itr = new StringTokenizer(line);

      float[] data = new float[NODES_INPUT];
      int tot = 0;
      while (itr.hasMoreTokens()) {
        String curr = itr.nextToken();
        data[tot] = Float.parseFloat(curr);
        tot++;
      }

      for (int i = 0; i < nodes_layer[layer_ind - 1]; i++) {
        sample_mem[layer_ind - 1][i] = data[i];
      }
      num_train_data++;

      work_update();
    }

    public void cleanup(Context context) throws IOException, InterruptedException {
      // Emit this mapper's locally updated parameters, keyed by the number of examples it saw.
      comp.set(String.valueOf(num_train_data));

      ArrayListOfFloatsWritable[] W = new ArrayListOfFloatsWritable[GlobalUtil.NUM_LAYER + 1];
      ArrayListOfFloatsWritable[] BV = new ArrayListOfFloatsWritable[GlobalUtil.NUM_LAYER + 1];
      ArrayListOfFloatsWritable[] BH = new ArrayListOfFloatsWritable[GlobalUtil.NUM_LAYER + 1];
      for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++) {
        W[k] = new ArrayListOfFloatsWritable(weights[k]);
        BV[k] = new ArrayListOfFloatsWritable(bv[k]);
        BH[k] = new ArrayListOfFloatsWritable(bh[k]);
      }
      model.setWeight(W);
      model.setBV(BV);
      model.setBH(BH);

      context.write(comp, model);
    }

    // Performs one contrastive-divergence (CD-1) style update of the current layer
    // using the example most recently stored in sample_mem.
    void work_update() {
      float[] x0 = new float[nodes_layer[layer_ind - 1]]; // data
      float[] h0 = new float[nodes_layer[layer_ind]];     // hidden
      float[] x1 = new float[nodes_layer[layer_ind - 1]];
      float[] h1 = new float[nodes_layer[layer_ind]];

      float[] inc_w = new float[nodes_layer[layer_ind - 1] * nodes_layer[layer_ind]]; // previous increase of weights
      float[] inc_bv = new float[nodes_layer[layer_ind - 1]];
      float[] inc_bh = new float[nodes_layer[layer_ind]];
      Arrays.fill(inc_w, 0);
      Arrays.fill(inc_bv, 0);
      Arrays.fill(inc_bh, 0);

      for (int i = 0; i < nodes_layer[layer_ind - 1]; i++)
        x0[i] = sample_mem[layer_ind - 1][i];

      if (layer_ind != NUM_LAYER) { // normal layer
        // perform real computation
        GlobalUtil.sigm(h0, bh[layer_ind], weights[layer_ind], x0,
            nodes_layer[layer_ind], nodes_layer[layer_ind - 1], true); // up sampling

        for (int j = 0; j < nodes_layer[layer_ind]; j++)
          sample_mem[layer_ind][j] = h0[j];

        // binarize the hidden activations
        for (int i = 0; i < nodes_layer[layer_ind]; i++) {
          if (rd.nextFloat() < h0[i])
            h0[i] = 1;
          else
            h0[i] = 0;
        }

        GlobalUtil.sigm(x1, bv[layer_ind], weights[layer_ind], h0,
            nodes_layer[layer_ind], nodes_layer[layer_ind - 1], false); // down sampling
        GlobalUtil.sigm(h1, bh[layer_ind], weights[layer_ind], x1,
            nodes_layer[layer_ind], nodes_layer[layer_ind - 1], true);

        // gradient step with momentum (mu) and weight decay (reg)
        for (int j = 0; j < nodes_layer[layer_ind]; j++)
          for (int i = 0; i < nodes_layer[layer_ind - 1]; i++) {
            inc_w[j * nodes_layer[layer_ind - 1] + i] =
                mu * inc_w[j * nodes_layer[layer_ind - 1] + i]
                + yita_w * (h0[j] * x0[i] - h1[j] * x1[i]
                    - reg * weights[layer_ind][j * nodes_layer[layer_ind - 1] + i]);
            weights[layer_ind][j * nodes_layer[layer_ind - 1] + i] =
                weights[layer_ind][j * nodes_layer[layer_ind - 1] + i]
                + inc_w[j * nodes_layer[layer_ind - 1] + i];
          }

        for (int j = 0; j < nodes_layer[layer_ind]; j++) {
          inc_bh[j] = mu * inc_bh[j] + yita_bh * (h0[j] - h1[j] - reg * bh[layer_ind][j]);
          bh[layer_ind][j] = bh[layer_ind][j] + inc_bh[j];
        }

        for (int i = 0; i < nodes_layer[layer_ind - 1]; i++) {
          inc_bv[i] = mu * inc_bv[i] + yita_bv * (x0[i] - x1[i] - reg * bv[layer_ind][i]);
          bv[layer_ind][i] = bv[layer_ind][i] + inc_bv[i];
        }
        // print the layer input data (just for testing)
      } else { // top layer: hidden units are linear, not sigmoid
        // perform real computation
        for (int j = 0; j < nodes_layer[NUM_LAYER]; j++) {
          h0[j] = bh[NUM_LAYER][j];
          for (int i = 0; i < nodes_layer[NUM_LAYER - 1]; i++)
            h0[j] = h0[j] + weights[NUM_LAYER][j * nodes_layer[NUM_LAYER - 1] + i] * x0[i];
        }

        for (int j = 0; j < nodes_layer[layer_ind]; j++)
          sample_mem[layer_ind][j] = h0[j];

        GlobalUtil.sigm(x1, bv[layer_ind], weights[NUM_LAYER], h0,
            nodes_layer[layer_ind], nodes_layer[layer_ind - 1], false); // down sampling

        for (int j = 0; j < nodes_layer[NUM_LAYER]; j++) {
          h1[j] = bh[NUM_LAYER][j];
          for (int i = 0; i < nodes_layer[NUM_LAYER - 1]; i++)
            h1[j] = h1[j] + weights[NUM_LAYER][j * nodes_layer[NUM_LAYER - 1] + i] * x1[i];
        }

        for (int j = 0; j < nodes_layer[layer_ind]; j++)
          for (int i = 0; i < nodes_layer[layer_ind - 1]; i++) {
            inc_w[j * nodes_layer[layer_ind - 1] + i] =
                mu * inc_w[j * nodes_layer[layer_ind - 1] + i]
                + yita_wt * (h0[j] * x0[i] - h1[j] * x1[i]
                    - reg * weights[layer_ind][j * nodes_layer[layer_ind - 1] + i]);
            weights[layer_ind][j * nodes_layer[layer_ind - 1] + i] =
                weights[layer_ind][j * nodes_layer[layer_ind - 1] + i]
                + inc_w[j * nodes_layer[layer_ind - 1] + i];
          }

        for (int j = 0; j < nodes_layer[layer_ind]; j++) {
          inc_bh[j] = mu * inc_bh[j] + yita_bht * (h0[j] - h1[j] - reg * bh[layer_ind][j]);
          bh[layer_ind][j] = bh[layer_ind][j] + inc_bh[j];
        }

        for (int i = 0; i < nodes_layer[layer_ind - 1]; i++) {
          inc_bv[i] = mu * inc_bv[i] + yita_bvt * (x0[i] - x1[i] - reg * bv[layer_ind][i]);
          bv[layer_ind][i] = bv[layer_ind][i] + inc_bv[i];
        }
        // print the layer input data (just for testing)
      }
    }
  }

  protected static class MyReducer extends Reducer<Text, ModelNode, Text, ModelNode> {
    private static final Text result = new Text();
    private static ModelNode model = new ModelNode(); // not final: reduce() may replace it via combine()
    private static final Random rd = new Random();
    private static float[][] weights = new float[GlobalUtil.NUM_LAYER + 1][]; // space storing the updating weights (first is not used)
    private static float[][] bh = new float[GlobalUtil.NUM_LAYER + 1][];      // hidden layer biases (rbm)
    private static float[][] bv = new float[GlobalUtil.NUM_LAYER + 1][];      // visible layer biases (rbm)

    private static int NUM_LAYER = GlobalUtil.NUM_LAYER;
    private static int NODES_INPUT = GlobalUtil.NODES_INPUT;
    private static final int[] train_len = GlobalUtil.train_len;
    private static final int[] test_len = GlobalUtil.test_len;
    private static final int[] nodes_layer = GlobalUtil.nodes_layer;

    private static float
        yita_w = GlobalUtil.yita_w, yita_bv = GlobalUtil.yita_bv, yita_bh = GlobalUtil.yita_bh,
        yita_wt = GlobalUtil.yita_wt, yita_bvt = GlobalUtil.yita_bvt, yita_bht = GlobalUtil.yita_bht; // learning rates
    private static float mu = GlobalUtil.mu, reg = GlobalUtil.reg;

    private static int layer_ind = 0;
    private static int count = 0;

    public void setup(Context context) throws IOException {
      // load the information of k clusters
      layer_ind = context.getConfiguration().getInt("layer_ind", 0);

      // Initialize the memory for weight parameters
      for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++) {
        weights[k] = new float[GlobalUtil.nodes_layer[k - 1] * GlobalUtil.nodes_layer[k]];
        bv[k] = new float[GlobalUtil.nodes_layer[k - 1]];
        bh[k] = new float[GlobalUtil.nodes_layer[k]];
      }
      count = 0;
    }

    public void cleanup(Context context) throws IOException, InterruptedException {
      result.set("result");
      context.write(result, model);
    }

    @Override
    public void reduce(Text key, Iterable<ModelNode> values, Context context)
        throws IOException, InterruptedException {
      Iterator<ModelNode> iter = values.iterator();
      while (iter.hasNext()) {
        ModelNode now = iter.next();
        combine(now);
      }
    }

    // Keeps the first model seen. (The original "combine(ModelNode model, ModelNode now)"
    // only reassigned a local parameter and had no effect; assigning to the static field
    // fixes that. Note that Hadoop reuses the Writable instance handed out by the values
    // iterator, so a deep copy may be needed in practice.)
    void combine(ModelNode now) {
      if (count == 0) {
        model = now;
      }
      count++;
    }
  }

  // The map output value is ModelNode, so the partitioner is typed accordingly.
  protected static class MyPartitioner extends Partitioner<Text, ModelNode> {
    @Override
    public int getPartition(Text key, ModelNode value, int numReduceTasks) {
      return 0; // every key goes to the first reducer
    }
  }

  public AutoCoder() {}

  private static final String INPUT = "input";
  private static final String OUTPUT = "output";
  private static final String NUM_REDUCERS = "numReducers";

  private static int printUsage() {
    System.out.println("usage: [input-path] [output-path] [num-reducers]");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  /**
   * Runs this tool.
   */
  @SuppressWarnings({ "static-access" })
  public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
        .withDescription("number of reducers").create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
      cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
      System.err.println("Error parsing command line: " + exp.getMessage());
      return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
      System.out.println("args: " + Arrays.toString(args));
      HelpFormatter formatter = new HelpFormatter();
      formatter.setWidth(120);
      formatter.printHelp(this.getClass().getName(), options);
      ToolRunner.printGenericCommandUsage(System.out);
      return -1;
    }

    String inputPath0 = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS)
        ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;

    LOG.info("Tool: " + AutoCoder.class.getSimpleName());
    LOG.info(" - input path: " + inputPath0);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Configuration conf = getConf();
    initialParameters(conf);

    for (int iterations = 1; iterations < GlobalUtil.NUM_LAYER + 1; iterations++) {
      LOG.info("** Layer: " + iterations);
      try {
        Job job = Job.getInstance(conf);
        job.setJobName(AutoCoder.class.getSimpleName());
        job.setJarByClass(AutoCoder.class);

        // set the path of the information of k clusters in this iteration
        job.getConfiguration().set("sidepath", inputPath0 + "/side_output");
        job.getConfiguration().setInt("layer_ind", iterations);
        job.setNumReduceTasks(reduceTasks);

        String inputPath = inputPath0 + "/train";
        dataShuffle();

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ModelNode.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ModelNode.class);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setPartitionerClass(MyPartitioner.class);

        // Delete the output directory if it exists already.
        Path outputDir = new Path(outputPath);
        FileSystem.get(getConf()).delete(outputDir, true);

        long startTime = System.currentTimeMillis();
        job.waitForCompletion(true);
        LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

        prepareNextIteration(inputPath0, outputPath, iterations, conf, reduceTasks);
      } catch (Exception exp) {
        exp.printStackTrace();
      }
    }
    return 0;
  }

  /**
   * Dispatches command-line arguments to the tool via the {@code ToolRunner}.
   */
  public static void main(String[] args) throws Exception {
    ToolRunner.run(new AutoCoder(), args);
  }

  public static void initialParameters(Configuration conf) {
  }

  public static void dataShuffle() {
  }

  public static void prepareNextIteration(String input, String output, int iterations,
      Configuration conf, int reduceTasks) {
    String dstName = input + "/cluster" + iterations;
    try {
      FileSystem fs = FileSystem.get(conf);
      fs.delete(new Path(dstName), true);
      FSDataOutputStream clusterfile = fs.create(new Path(dstName));

      for (int i = 0; i < reduceTasks; i++) {
        String srcName = output + "/part-r-" + String.format("%05d", i);
        FSDataInputStream cluster = fs.open(new Path(srcName));
        BufferedReader reader = new BufferedReader(new InputStreamReader(cluster));
        while (reader.ready()) {
          String line = reader.readLine() + "\n";
          if (line.length() > 5)
            clusterfile.write(line.getBytes());
        }
        reader.close();
        cluster.close();
      }

      clusterfile.flush();
      clusterfile.close();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
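Note: the classes GlobalUtil and ModelNode referenced above belong to the same layer package but are not shown here. The sketch below is a hypothetical reconstruction of GlobalUtil.sigm(...), inferred only from how the mapper calls it (output buffer, bias vector, row-major weight matrix, input buffer, hidden size, visible size, up/down flag); the real helper may differ, and the constants it would also hold (NUM_LAYER, nodes_layer, the learning rates, and so on) are omitted.

public class GlobalUtil {
  // Computes sigmoid activations for one RBM layer.
  //   up == true : visible -> hidden,  out[j] = sigmoid(bias[j] + sum_i w[j*nv + i] * in[i])
  //   up == false: hidden  -> visible, out[i] = sigmoid(bias[i] + sum_j w[j*nv + i] * in[j])
  public static void sigm(float[] out, float[] bias, float[] w, float[] in,
      int nh, int nv, boolean up) {
    if (up) {
      for (int j = 0; j < nh; j++) {
        float s = bias[j];
        for (int i = 0; i < nv; i++)
          s += w[j * nv + i] * in[i];
        out[j] = (float) (1.0 / (1.0 + Math.exp(-s)));
      }
    } else {
      for (int i = 0; i < nv; i++) {
        float s = bias[i];
        for (int j = 0; j < nh; j++)
          s += w[j * nv + i] * in[j];
        out[i] = (float) (1.0 / (1.0 + Math.exp(-s)));
      }
    }
  }
}

Assuming the compiled classes are packaged into a jar (the jar name and HDFS paths below are only placeholders), the tool would be launched through ToolRunner with the options defined in run(); the input directory is expected to contain a train/ subdirectory and a side_output file holding the initial parameters:

  hadoop jar autocoder.jar layer.AutoCoder -input /data/autocoder -output /data/autocoder/out -numReducers 1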