org.avenir.tree.DataPartitioner.java Source code

Java tutorial

Introduction

Here is the source code for org.avenir.tree.DataPartitioner.java

Source

/*
 * avenir: Predictive analytic based on Hadoop Map Reduce
 * Author: Pranab Ghosh
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0 
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.avenir.tree;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.avenir.util.AttributeSplitHandler;
import org.chombo.mr.FeatureField;
import org.chombo.mr.FeatureSchema;
import org.chombo.util.SecondarySort;
import org.chombo.util.Utility;
import org.codehaus.jackson.map.ObjectMapper;

/**
 * Partitions data based on a split selected among some candidate splits
 * generated from the parent node and corresponding data
 * @author pranab
 *
 */
public class DataPartitioner extends Configured implements Tool {
    private static final Logger LOG = Logger.getLogger(DataPartitioner.class);
    private boolean debugOn;

    @Override
    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        String jobName = "Partitions data by some split";
        job.setJobName(jobName);

        job.setJarByClass(DataPartitioner.class);

        Utility.setConfiguration(job.getConfiguration(), "avenir");
        debugOn = job.getConfiguration().getBoolean("debug.on", false);
        if (debugOn) {
            LOG.setLevel(Level.DEBUG);
        }

        job.setMapperClass(DataPartitioner.PartitionerMapper.class);
        job.setReducerClass(DataPartitioner.PartitionerReducer.class);

        //find best split and create output path
        String inPath = getNodePath(job);
        if (debugOn)
            System.out.println("inPath:" + inPath);
        Split split = findBestSplitKey(job, inPath);
        String outPath = inPath + "/" + "split=" + split.getIndex();
        if (debugOn)
            System.out.println("outPath:" + outPath);

        FileInputFormat.addInputPath(job, new Path(inPath));
        FileOutputFormat.setOutputPath(job, new Path(outPath));

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setPartitionerClass(SecondarySort.RawIntKeyTextPartitioner.class);
        //one reducer per split segment so each reduce output file maps to one segment
        int numReducers = split.getSegmentCount();
        if (debugOn)
            System.out.println("numReducers:" + numReducers);
        job.setNumReduceTasks(numReducers);

        int status = job.waitForCompletion(true) ? 0 : 1;
        //move output to segment directories
        if (status == 0) {
            moveOutputToSegmentDir(outPath, split.getSegmentCount(), job.getConfiguration());
        }
        return status;
    }

    /**
     * Moves each reducer output file into its own segment sub directory.
     * @param outPath       job output directory containing part-r-NNNNN files
     * @param segmentCount  number of split segments i.e. number of reducers
     * @param conf          hadoop configuration
     * @throws IOException if a segment directory can not be created or a part file
     *         can not be moved
     */
    private void moveOutputToSegmentDir(String outPath, int segmentCount, Configuration conf) throws IOException {
        //NOTE: FileSystem.get() returns a JVM wide cached instance; do not close it here,
        //closing would invalidate it for any other user in this process
        FileSystem fileSystem = FileSystem.get(conf);
        for (int i = 0; i < segmentCount; ++i) {
            //create segment dir
            String dir = outPath + "/segment=" + i + "/data";
            Path segmentPath = new Path(dir);
            if (!fileSystem.mkdirs(segmentPath)) {
                throw new IOException("failed to create segment directory " + dir);
            }

            //move output to segment dir; reducer output names are zero padded to
            //5 digits (part-r-00000, ... part-r-00010, ...)
            Path srcFile = new Path(String.format("%s/part-r-%05d", outPath, i));
            Path dstFile = new Path(outPath + "/segment=" + i + "/data/partition.txt");
            if (!fileSystem.rename(srcFile, dstFile)) {
                throw new IOException("failed to move " + srcFile + " to " + dstFile);
            }
        }
    }

    /**
     * Builds the input path for the current node's data from configuration.
     * @param job current job
     * @return path of the data directory for the node being split
     * @throws IllegalStateException if project.base.path is not configured
     */
    private String getNodePath(Job job) {
        String nodePath = null;
        Configuration conf = job.getConfiguration();
        String basePath = conf.get("project.base.path");
        if (Utility.isBlank(basePath)) {
            throw new IllegalStateException("base path not defined");
        }
        String splitPath = conf.get("split.path");
        if (debugOn)
            System.out.println("basePath:" + basePath + " splitPath:" + splitPath);
        //root node data lives directly under split=root/data; deeper nodes append
        //their relative split path
        nodePath = Utility.isBlank(splitPath) ? basePath + "/split=root/data"
                : basePath + "/split=root/data/" + splitPath;
        return nodePath;
    }

    /**
     * Finds best split according to chosen strategy ("best" takes the top ranked
     * split, "randomFromTop" picks randomly among the top n) and publishes the
     * chosen split attribute ordinal and split key in the configuration.
     * @param job current job
     * @param inputPath node data path; candidate splits are in the sibling "splits" dir
     * @return the selected split
     * @throws IOException if the candidate splits file can not be read
     * @throws IllegalStateException if the candidate splits file is empty
     */
    private Split findBestSplitKey(Job job, String inputPath) throws IOException {
        String splitKey = null;
        Configuration conf = job.getConfiguration();
        String splitSelectionStrategy = conf.get("split.selection.strategy", "best");

        String candidateSplitsPath = Utility.getSiblingPath(inputPath, "splits/part-r-00000");
        if (debugOn)
            System.out.println("candidateSplitsPath:" + candidateSplitsPath);
        conf.set("candidate.splits.path", candidateSplitsPath);
        List<String> lines = Utility.getFileLines(conf, "candidate.splits.path");
        if (lines.isEmpty()) {
            throw new IllegalStateException("no candidate splits found in " + candidateSplitsPath);
        }

        //create split objects and sort
        Split[] splits = new Split[lines.size()];
        int i = 0;
        for (String line : lines) {
            splits[i] = new Split(line, i);
            ++i;
        }

        //sort splits descending by the split metric
        Arrays.sort(splits);

        //find split; "best" keeps index 0, i.e. the top ranked split
        int splitIndex = 0;
        if (splitSelectionStrategy.equals("best")) {
        } else if (splitSelectionStrategy.equals("randomFromTop")) {
            //clamp to the actual number of candidates so a large num.top.splits
            //can not index past the end of the array
            int numSplits = Math.min(conf.getInt("num.top.splits", 5), splits.length);
            splitIndex = (int) (Math.random() * numSplits);
        }
        Split split = splits[splitIndex];

        //set split attribute ordinal and split key for the mapper
        int splitAttribute = split.getAttributeOrdinal();
        conf.setInt("split.attribute", splitAttribute);
        if (debugOn)
            System.out.println("splitAttribute:" + splitAttribute);
        splitKey = split.getSplitKey();
        if (debugOn)
            System.out.println("splitKey:" + splitKey);
        conf.set("split.key", splitKey);

        return split;
    }

    /**
     * Sortable candidate split. Parsed from a semicolon separated line of the
     * form: attributeOrdinal;splitKey;metricValue. Sorts descending by metric.
     * @author pranab
     *
     */
    private static class Split implements Comparable<Split> {
        private String line;
        private int index;
        private String[] items;

        /**
         * @param line  semicolon separated split descriptor line
         * @param index position of the line in the candidate splits file
         */
        public Split(String line, int index) {
            this.line = line;
            this.index = index;
            items = line.split(";");
        }

        @Override
        public int compareTo(Split that) {
            double thisVal = Double.parseDouble(items[2]);
            double thatVal = Double.parseDouble(that.items[2]);

            //descending order; Double.compare handles all edge cases consistently
            return Double.compare(thatVal, thisVal);
        }

        /**
         * Split key i.e. the segment boundary descriptor
         * @return raw split key
         */
        public String getSplitKey() {
            return items[1];
        }

        /**
         * Split key normalized for use in a file system path: whitespace and
         * brackets removed, ":" replaced with "-"
         * @return normalized split key
         */
        public String getNormalizedSplitKey() {
            String key = items[1].replaceAll("\\s+", "");
            key = key.replaceAll("\\[", "");
            key = key.replaceAll("\\]", "");
            key = key.replaceAll(":", "-");
            return key;
        }

        /**
         * Split attribute ordinal
         * @return ordinal of the attribute this split is on
         */
        public int getAttributeOrdinal() {
            return Integer.parseInt(items[0]);
        }

        /**
         * Number of segments in the split (":" separates segment descriptors
         * within the split key)
         * @return segment count
         */
        public int getSegmentCount() {
            String[] segments = items[1].split(":");
            return segments.length;
        }

        public String getLine() {
            return line;
        }

        public int getIndex() {
            return index;
        }
    }

    /**
     * Maps each record to the index of the split segment its split attribute
     * value falls into.
     * @author pranab
     *
     */
    public static class PartitionerMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
        private String fieldDelimRegex;
        private String[] items;
        private IntWritable outKey = new IntWritable();
        private Text outVal = new Text();
        private FeatureSchema schema;
        private int splitAttrOrd;
        private FeatureField featureField;
        private AttributeSplitHandler.Split split;
        private int splitSegment;
        private String attrVal;

        private static final Logger LOG = Logger.getLogger(PartitionerMapper.class);

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
         */
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            if (conf.getBoolean("debug.on", false)) {
                LOG.setLevel(Level.DEBUG);
            }
            fieldDelimRegex = conf.get("field.delim.regex", ",");

            splitAttrOrd = conf.getInt("split.attribute", -1);
            if (splitAttrOrd == -1) {
                throw new IOException("split attribute not found");
            }
            LOG.debug("splitAttrOrd:" + splitAttrOrd);
            String splitKey = conf.get("split.key");
            LOG.debug("splitKey:" + splitKey);

            InputStream fs = Utility.getFileStream(context.getConfiguration(), "feature.schema.file.path");
            ObjectMapper mapper = new ObjectMapper();
            schema = mapper.readValue(fs, FeatureSchema.class);
            featureField = schema.findFieldByOrdinal(splitAttrOrd);
            if (featureField.isInteger()) {
                split = new AttributeSplitHandler.IntegerSplit(splitKey);
            } else if (featureField.isCategorical()) {
                split = new AttributeSplitHandler.CategoricalSplit(splitKey);
            } else {
                //fail fast with a diagnostic instead of an NPE at fromString()
                throw new IOException("unsupported split attribute type for ordinal " + splitAttrOrd);
            }
            split.fromString();

        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            items = value.toString().split(fieldDelimRegex);

            //key is split segment index for the record's split attribute value
            attrVal = items[splitAttrOrd];
            splitSegment = split.getSegmentIndex(attrVal);
            LOG.debug("splitSegment:" + splitSegment);

            outKey.set(splitSegment);

            context.write(outKey, value);
        }
    }

    /**
     * Identity style reducer: emits values unchanged so each reducer's output
     * file holds exactly one segment's records.
     * @author pranab
     *
     */
    public static class PartitionerReducer extends Reducer<IntWritable, Text, NullWritable, Text> {

        /* (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
         */
        protected void reduce(IntWritable key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text value : values) {
                context.write(NullWritable.get(), value);
            }
        }
    }

    /**
     * Entry point; runs the job through ToolRunner.
     * @param args command line arguments
     */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new DataPartitioner(), args);
        System.exit(exitCode);
    }

}