bigsidemodel.AutoCoder.java Source code

Introduction

Here is the source code for bigsidemodel.AutoCoder.java, a Hadoop MapReduce tool (built on the Cloud9 toolkit) that pre-trains a stacked restricted Boltzmann machine (RBM) autoencoder. A first job trains one model per reducer and encodes the remaining records through it; a second job merges the per-reducer encodings and trains a combined SuperModel.

Source

package bigsidemodel;
/*
 * Cloud9: A Hadoop toolkit for working with big data
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Random;
import java.util.StringTokenizer;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.mortbay.log.Log;

import edu.umd.cloud9.io.pair.PairOfInts;

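/**
 * Two-stage MapReduce driver for distributed autoencoder pre-training.
 * Job 0 scatters training records across reducers, each of which trains its
 * own stack of RBMs and encodes the held-out records through that stack;
 * job 1 gathers the per-reducer encodings and trains a combined SuperModel.
 * Both models are written to HDFS as side data rather than as reducer output.
 */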
public class AutoCoder extends Configured implements Tool {
    private static final Logger LOG = Logger.getLogger(AutoCoder.class);

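    /**
     * Job 0 mapper. Each input line is a whitespace-separated vector of pixel
     * values in [0, 255], keyed by an integer type; values are scaled to
     * [0, 1]. Training records (type 0) are sent to one reducer chosen at
     * random; records of any other type are broadcast to every reducer so
     * that each locally trained model can encode the full set.
     */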
    protected static class MyMapper0 extends Mapper<Text, Text, PairOfInts, DataNode> {
        private static final PairOfInts data_RedID_Type = new PairOfInts();
        private static final DataNode dataNode = new DataNode();

        private static final Random rd = new Random();

        private static int numReducers = 0;
        private int count = 0;

        @Override
        public void setup(Context context) throws IOException {
            numReducers = context.getConfiguration().getInt("num_reduce_task", 1);
            count = 0;
        }

        @Override
        public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            if (line.length() == 0)
                return;
            StringTokenizer itr = new StringTokenizer(line);
            float[] data = new float[GlobalUtil.NODES_INPUT];

            int tot = 0;
            while (itr.hasMoreTokens()) {
                String curr = itr.nextToken();
                data[tot] = Float.parseFloat(curr) / 255.0f;
                tot++;
            }

            int type = Integer.parseInt(key.toString());

            dataNode.setN(GlobalUtil.NODES_INPUT);
            dataNode.setData(data);

            count++;
            if (type == 0) {
                data_RedID_Type.set(rd.nextInt(numReducers), type); // pick one reducer at random
                context.write(data_RedID_Type, dataNode);
            } else {
                for (int i = 0; i < numReducers; i++) {
                    data_RedID_Type.set(i, type);
                    context.write(data_RedID_Type, dataNode);
                }
            }
        }
    }

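    /**
     * Job 0 reducer. For training records (type 0) it greedily trains one RBM
     * per layer, taking a single contrastive-divergence step per record (see
     * work_update). For all other records it runs the data through the learned
     * stack (model.sim) and emits the top-layer encoding, re-keyed as
     * (type, reducerID) so job 1 can regroup encodings across models. The
     * trained model itself is written to the side path in cleanup().
     */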
    protected static class MyReducer0 extends Reducer<PairOfInts, DataNode, PairOfInts, DataNode> {
        private static final PairOfInts commonID = new PairOfInts();
        private static final DataNode commonData = new DataNode();
        private static final ModelNode model = new ModelNode();

        private static final Random rd = new Random();
        private static float[][] sample_mem = new float[GlobalUtil.NUM_LAYER + 1][]; // per-layer MCMC samples
        private static float[][] weights = new float[GlobalUtil.NUM_LAYER + 1][]; // RBM weights per layer (index 0 unused)
        private static float[][] bh = new float[GlobalUtil.NUM_LAYER + 1][]; // hidden-layer biases (RBM)
        private static float[][] bv = new float[GlobalUtil.NUM_LAYER + 1][]; // visible-layer biases (RBM)
        private static float[][] inc_w = new float[GlobalUtil.NUM_LAYER + 1][]; // momentum term for weights
        private static float[][] inc_bv = new float[GlobalUtil.NUM_LAYER + 1][]; // momentum term for visible biases
        private static float[][] inc_bh = new float[GlobalUtil.NUM_LAYER + 1][]; // momentum term for hidden biases

        private static int NUM_LAYER = GlobalUtil.NUM_LAYER;
        private static int NODES_INPUT = GlobalUtil.NODES_INPUT;
        private static final int[] train_len = GlobalUtil.train_len;
        private static final int[] test_len = GlobalUtil.test_len;
        private static final int[] nodes_layer = GlobalUtil.nodes_layer;

        private static float yita_w = GlobalUtil.yita_w, yita_bv = GlobalUtil.yita_bv, yita_bh = GlobalUtil.yita_bh,
                yita_wt = GlobalUtil.yita_wt, yita_bvt = GlobalUtil.yita_bvt, yita_bht = GlobalUtil.yita_bht; // learning rates
        private static float mu = GlobalUtil.mu, reg = GlobalUtil.reg;

        private boolean flag = false;
        private int reducerID = 0;
        private int count = 0;

        @Override
        public void setup(Context context) throws IOException {
            for (int k = 0; k < GlobalUtil.NUM_LAYER + 1; k++) {
                sample_mem[k] = new float[GlobalUtil.nodes_layer[k]];
            }

            for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++) {
                weights[k] = new float[GlobalUtil.nodes_layer[k - 1] * GlobalUtil.nodes_layer[k]];
                inc_w[k] = new float[GlobalUtil.nodes_layer[k - 1] * GlobalUtil.nodes_layer[k]];
                bv[k] = new float[GlobalUtil.nodes_layer[k - 1]];
                inc_bv[k] = new float[GlobalUtil.nodes_layer[k - 1]];
                bh[k] = new float[GlobalUtil.nodes_layer[k]];
                inc_bh[k] = new float[GlobalUtil.nodes_layer[k]];
            }

            for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++)
                for (int j = 0; j < GlobalUtil.nodes_layer[k - 1] * GlobalUtil.nodes_layer[k]; j++)
                    weights[k][j] = 0.1f * (float) rd.nextGaussian();

            for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++)
                for (int j = 0; j < GlobalUtil.nodes_layer[k - 1]; j++)
                    bv[k][j] = 0.0f;

            for (int k = 1; k < GlobalUtil.NUM_LAYER + 1; k++)
                for (int j = 0; j < GlobalUtil.nodes_layer[k]; j++)
                    bh[k][j] = 0.0f;

            for (int k = 1; k <= NUM_LAYER; k++) {
                Arrays.fill(inc_w[k], 0);
                Arrays.fill(inc_bv[k], 0);
                Arrays.fill(inc_bh[k], 0);
            }
            flag = false;
            count = 0;
        }

        public void reduce(PairOfInts key, Iterable<DataNode> values, Context context)
                throws IOException, InterruptedException {
            Iterator<DataNode> iter = values.iterator();
            float[] data;
            reducerID = key.getLeftElement();
            if (key.getRightElement() == 0) {
                while (iter.hasNext()) {
                    DataNode now = iter.next();
                    data = now.getData();

                    for (int i = 0; i < nodes_layer[0]; i++) {
                        sample_mem[0][i] = data[i];
                    }
                    for (int i = 1; i <= GlobalUtil.NUM_LAYER; i++) {
                        work_update(i);
                    }
                }
            } else {
                float[] result;
                if (!flag) {
                    model.setWeight(weights);
                    model.setBV(bv);
                    model.setBH(bh);
                    flag = true;
                }
                while (iter.hasNext()) {
                    DataNode now = iter.next();
                    data = now.getData();
                    result = model.sim(data);

                    count++;
                    commonID.set(key.getRightElement(), key.getLeftElement());
                    commonData.setN(GlobalUtil.nodes_layer[GlobalUtil.NUM_LAYER]);
                    commonData.setData(result);
                    context.write(commonID, commonData);
                }
            }
        }

        @Override
        public void cleanup(Context context) throws IOException, InterruptedException {
            // write the trained model to the side path
            Log.info("model: " + reducerID + "  encoded records: " + count);
            String dstName = context.getConfiguration().get("sidepath") + "model" + reducerID;
            try {
                FileSystem fs = FileSystem.get(context.getConfiguration());
                fs.delete(new Path(dstName), true);
                FSDataOutputStream modelfile = fs.create(new Path(dstName));
                model.write(modelfile);
                modelfile.flush();
                modelfile.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

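        /**
         * One CD-1 (contrastive divergence) update for the RBM at layer_ind:
         * sample hidden units h0 from the visible data x0, reconstruct the
         * visible layer x1 from h0, resample hidden units h1 from x1, then
         * nudge the weights and biases along (h0*x0 - h1*x1) with learning
         * rates yita_*, momentum mu, and weight decay reg. The hidden sample
         * h0 is stored in sample_mem as the input to the next layer's RBM.
         */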
        void work_update(int layer_ind) {
            float[] x0 = new float[nodes_layer[layer_ind - 1]]; // data
            float[] h0 = new float[nodes_layer[layer_ind]]; // hidden
            float[] x1 = new float[nodes_layer[layer_ind - 1]];
            float[] h1 = new float[nodes_layer[layer_ind]];

            for (int i = 0; i < nodes_layer[layer_ind - 1]; i++)
                x0[i] = sample_mem[layer_ind - 1][i];

            GlobalUtil.sigm(h0, bh[layer_ind], weights[layer_ind], x0, nodes_layer[layer_ind],
                    nodes_layer[layer_ind - 1], true);// up sampling

            for (int j = 0; j < nodes_layer[layer_ind]; j++)
                sample_mem[layer_ind][j] = h0[j];

            GlobalUtil.sigm(x1, bv[layer_ind], weights[layer_ind], h0, nodes_layer[layer_ind],
                    nodes_layer[layer_ind - 1], false);// down sampling

            GlobalUtil.sigm(h1, bh[layer_ind], weights[layer_ind], x1, nodes_layer[layer_ind],
                    nodes_layer[layer_ind - 1], true);

            for (int j = 0; j < nodes_layer[layer_ind]; j++)
                for (int i = 0; i < nodes_layer[layer_ind - 1]; i++) {
                    inc_w[layer_ind][j * nodes_layer[layer_ind - 1] + i] = mu
                            * inc_w[layer_ind][j * nodes_layer[layer_ind - 1] + i]
                            + yita_w * (h0[j] * x0[i] - h1[j] * x1[i]
                                    - reg * weights[layer_ind][j * nodes_layer[layer_ind - 1] + i]);
                    weights[layer_ind][j * nodes_layer[layer_ind - 1]
                            + i] = weights[layer_ind][j * nodes_layer[layer_ind - 1] + i]
                                    + inc_w[layer_ind][j * nodes_layer[layer_ind - 1] + i];
                }

            for (int j = 0; j < nodes_layer[layer_ind]; j++) {
                inc_bh[layer_ind][j] = mu * inc_bh[layer_ind][j]
                        + yita_bh * (h0[j] - h1[j] - reg * bh[layer_ind][j]);
                bh[layer_ind][j] = bh[layer_ind][j] + inc_bh[layer_ind][j];
            }

            for (int i = 0; i < nodes_layer[layer_ind - 1]; i++) {
                inc_bv[layer_ind][i] = mu * inc_bv[layer_ind][i]
                        + yita_bv * (x0[i] - x1[i] - reg * bv[layer_ind][i]);
                bv[layer_ind][i] = bv[layer_ind][i] + inc_bv[layer_ind][i];
            }
        }
    }

    protected static class MyMapper1 extends Mapper<PairOfInts, DataNode, PairOfInts, DataNode> {
        @Override
        public void map(PairOfInts key, DataNode value, Context context) throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

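    /**
     * Job 1 reducer (run as a single reduce task). Keys arrive sorted as
     * (leftKey, modelID); for each left-key group it concatenates the
     * top-layer encodings from all numModel models into one layer0 vector
     * and trains the SuperModel on it. The combined model is written to the
     * side path in cleanup().
     */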
    protected static class MyReducer1 extends Reducer<PairOfInts, DataNode, NullWritable, NullWritable> {
        private static SuperModel superModel = null;

        private static int count = 0;
        private static int numModel = 0;
        private static int prevID = -1;
        private static int nEach = GlobalUtil.nodes_layer[GlobalUtil.NUM_LAYER];
        private static float[] layer0;

        @Override
        public void setup(Context context) throws IOException {
            numModel = context.getConfiguration().getInt("num_reduce_task", 1);
            count = 0;
            prevID = -1;
            layer0 = new float[numModel * nEach];
            superModel = new SuperModel(numModel);
        }

        @Override
        public void reduce(PairOfInts key, Iterable<DataNode> values, Context context)
                throws IOException, InterruptedException {
            Iterator<DataNode> iter = values.iterator();

            if (prevID != key.getLeftElement()) {
                if (prevID != -1) {
                    superModel.train(layer0);
                    count++;
                }
                prevID = key.getLeftElement();
            }
            int modelID = key.getRightElement();

            while (iter.hasNext()) {
                DataNode now = iter.next();
                float[] d = now.getData();
                for (int k = 0; k < nEach; k++)
                    layer0[modelID * nEach + k] = d[k];
            }
        }

        @Override
        public void cleanup(Context context) {
            // the reduce loop only trains when the group key changes, so
            // flush the final accumulated group before writing the model out
            if (prevID != -1) {
                superModel.train(layer0);
                count++;
            }
            // write the combined model to the side path
            Log.info("total: " + count);
            String dstName = context.getConfiguration().get("sidepath") + "super";
            try {
                FileSystem fs = FileSystem.get(context.getConfiguration());
                fs.delete(new Path(dstName), true);
                FSDataOutputStream modelfile = fs.create(new Path(dstName));
                superModel.write(modelfile);
                modelfile.flush();
                modelfile.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

    }

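    /**
     * Routes each pair to the reducer named by its left element, so a record
     * keyed (reducerID, type) in job 0 lands on reducer reducerID.
     */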
    protected static class MyPartitioner extends Partitioner<PairOfInts, DataNode> {
        @Override
        public int getPartition(PairOfInts key, DataNode value, int numReduceTasks) {
            return (key.getLeftElement()) % numReduceTasks;
        }
    }

    public AutoCoder() {
    }

    private static final String INPUT = "input";
    private static final String OUTPUT = "output";
    private static final String NUM_REDUCERS = "numReducers";

    private static int printUsage() {
        System.out.println("usage: [input-path] [output-path] [num-reducers]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    /**
     * Runs this tool.
     */
    @SuppressWarnings({ "static-access" })
    public int run(String[] args) throws Exception {
        Options options = new Options();

        options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
        options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
        options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
                .create(NUM_REDUCERS));

        CommandLine cmdline;
        CommandLineParser parser = new GnuParser();

        try {
            cmdline = parser.parse(options, args);
        } catch (ParseException exp) {
            System.err.println("Error parsing command line: " + exp.getMessage());
            return -1;
        }

        /*if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
           System.out.println("args: " + Arrays.toString(args));
           HelpFormatter formatter = new HelpFormatter();
           formatter.setWidth(120);
           formatter.printHelp(this.getClass().getName(), options);
           ToolRunner.printGenericCommandUsage(System.out);
           return -1;
        }*/

        // NOTE: the INPUT/OUTPUT options above are currently ignored; the
        // paths below are hardcoded.
        //String inputPath = cmdline.getOptionValue(INPUT);
        //String outputPath = cmdline.getOptionValue(OUTPUT);

        String inputPath = "qiwang321/best5-mingled-key-56x56/part*";
        String outputPath = "shangfu/bigoutput";
        int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
                : 1;

        LOG.info("Tool: " + AutoCoder.class.getSimpleName());
        LOG.info(" - input path: " + inputPath);
        LOG.info(" - output path: " + outputPath + "0");
        LOG.info(" - number of reducers: " + reduceTasks);
        Configuration conf = getConf();
        conf.setInt("num_reduce_task", reduceTasks);
        conf.set("sidepath", outputPath + "_side/");

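        // ======= Job 0: per-reducer RBM training on the raw key/value text input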
        Job job0 = Job.getInstance(conf);
        job0.setJobName(AutoCoder.class.getSimpleName());
        job0.setJarByClass(AutoCoder.class);
        job0.setNumReduceTasks(reduceTasks);

        FileInputFormat.setInputPaths(job0, new Path(inputPath));
        FileOutputFormat.setOutputPath(job0, new Path(outputPath + "0"));

        job0.setInputFormatClass(KeyValueTextInputFormat.class);
        job0.setOutputFormatClass(SequenceFileOutputFormat.class);

        job0.setMapOutputKeyClass(PairOfInts.class);
        job0.setMapOutputValueClass(DataNode.class);
        job0.setOutputKeyClass(PairOfInts.class);
        job0.setOutputValueClass(DataNode.class);

        job0.setMapperClass(MyMapper0.class);
        job0.setReducerClass(MyReducer0.class);
        job0.setPartitionerClass(MyPartitioner.class);

        // Delete the output directory if it exists already.
        Path outputDir = new Path(outputPath + "0");
        FileSystem.get(getConf()).delete(outputDir, true);

        long codeStart = System.currentTimeMillis();
        double jobTimeSum = 0;

        long startTime = System.currentTimeMillis();
        job0.waitForCompletion(true);
        LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        jobTimeSum += (System.currentTimeMillis() - startTime) / 1000.0;

        // ======= Job 1: merge per-reducer encodings and train the SuperModel
        LOG.info("Tool: " + AutoCoder.class.getSimpleName());
        LOG.info(" - input path: " + outputPath + "0");
        LOG.info(" - output path: " + outputPath + "1");
        LOG.info(" - number of reducers: " + 1);
        int nModel = reduceTasks;
        reduceTasks = 1;

        Job job1 = Job.getInstance(conf);
        job1.setJobName(AutoCoder.class.getSimpleName());
        job1.setJarByClass(AutoCoder.class);
        job1.setNumReduceTasks(reduceTasks);

        FileInputFormat.setInputPaths(job1, new Path(outputPath + "0"));
        FileOutputFormat.setOutputPath(job1, new Path(outputPath + "1"));

        job1.setInputFormatClass(SequenceFileInputFormat.class);
        job1.setOutputFormatClass(SequenceFileOutputFormat.class);

        job1.setMapOutputKeyClass(PairOfInts.class);
        job1.setMapOutputValueClass(DataNode.class);
        job1.setOutputKeyClass(NullWritable.class);
        job1.setOutputValueClass(NullWritable.class);

        job1.setMapperClass(MyMapper1.class);
        job1.setReducerClass(MyReducer1.class);
        job1.setPartitionerClass(MyPartitioner.class);

        // Delete the output directory if it exists already.
        outputDir = new Path(outputPath + "1");
        FileSystem.get(getConf()).delete(outputDir, true);

        startTime = System.currentTimeMillis();
        job1.waitForCompletion(true);
        LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
        jobTimeSum += (System.currentTimeMillis() - startTime) / 1000.0;
        LOG.info("Final Time: " + ((System.currentTimeMillis() - codeStart) / 1000.0) + " seconds,  " + jobTimeSum
                + " seconds.");

        return 0;
    }

    /**
     * Dispatches command-line arguments to the tool via the {@code ToolRunner}.
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new AutoCoder(), args));
    }
}
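
Note

The listing depends on helper classes that are not shown on this page: GlobalUtil (layer sizes, learning rates, and the sigm() sampling routine), the DataNode and ModelNode Writables, and SuperModel. As a rough guide to the data types involved, here is a minimal, hypothetical sketch of what the DataNode Writable could look like, inferred only from the calls this file makes on it (setN, setData, getData, and its use as a map/reduce value); the field names and wire format are assumptions, not the project's actual implementation.

package bigsidemodel;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

// Hypothetical sketch: a float-vector Writable matching the calls made in AutoCoder.
public class DataNode implements Writable {
    private int n;        // number of valid entries
    private float[] data; // payload

    public void setN(int n) { this.n = n; }
    public void setData(float[] data) { this.data = data; }
    public float[] getData() { return data; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(n);
        for (int i = 0; i < n; i++)
            out.writeFloat(data[i]);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        n = in.readInt();
        if (data == null || data.length < n)
            data = new float[n];
        for (int i = 0; i < n; i++)
            data[i] = in.readFloat();
    }
}

With the real helper classes on the classpath and the job jar built, the tool would be launched along the lines of hadoop jar <jar> bigsidemodel.AutoCoder -numReducers 4; as noted above, numReducers is the only command-line option the current code actually reads.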