com.ostor.dedup.hadoop.DedupStorHadoopCreateObjectsMapReduce.java Source code

Introduction

Here is the source code for com.ostor.dedup.hadoop.DedupStorHadoopCreateObjectsMapReduce.java
Source

/*
 * Created on Oct 29, 2009
 * Created by Praveen Patnala
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 */

package com.ostor.dedup.hadoop;

import java.io.IOException;
import java.util.*;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

import com.ostor.dedup.core.*;

public class DedupStorHadoopCreateObjectsMapReduce {
    private static Logger logger = Logger.getLogger(DedupStorHadoopCreateObjectsMapReduce.class.getName());

    public static class DedupStorHadoopCreateObjectsMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, DedupObjectSegmentWritable> {
        private static String inputSplitToken = null;
        private static int numInputTokens = 2;

        public void configure(JobConf conf) {
            PropertyConfigurator.configure(DedupStor.DEFAULT_LOG4J_FILE);
            logger.info("In mapper.configure()");
            inputSplitToken = conf.get("mapred.textoutputformat.separator", "\t");
            logger.info("Input split token - " + inputSplitToken);
        }

        public void map(LongWritable key, Text value, OutputCollector<Text, DedupObjectSegmentWritable> output,
                Reporter reporter) throws IOException {
            String input = value.toString();

            logger.debug("Mapper.map - input - " + input);

            String[] tokens = input.split(inputSplitToken);

            if (tokens.length != numInputTokens) {
                logger.error("Input line invalid - " + input);
                throw new IOException("Input line invalid - " + input);
            }

            String objName = tokens[0];

            DedupObjectSegment objSeg = new DedupObjectSegment();

            try {
                objSeg.restoreFromString(tokens[1]);
            }

            catch (Exception e) {
                e.printStackTrace();
                logger.error("Couldn't create object segment from input - " + input);
                throw new IOException("Couldn't create object segment from " + "input - " + input);
            }

            Text outputKey = new Text(objName);

            DedupObjectSegmentWritable outputValue = new DedupObjectSegmentWritable(objSeg);

            logger.debug("Emit output - " + outputKey + " with value - " + outputValue);

            output.collect(outputKey, outputValue);
        }
    }

    public static class DedupStorHadoopCreateObjectsReducer extends MapReduceBase
            implements Reducer<Text, DedupObjectSegmentWritable, Text, Text> {
        private Text successText = new Text("success");
        private Path objectStorPath;
        private FileSystem fs;

        public void configure(JobConf conf) {
            try {
                PropertyConfigurator.configure(DedupStor.DEFAULT_LOG4J_FILE);
                fs = FileSystem.get(conf);

                objectStorPath = new Path(conf.get(DedupStorHadoopUtils.HADOOP_CONF_OBJECTS_STOR_PATH_KEY));
            }

            catch (Exception e) {
                e.printStackTrace();
                logger.error("Caught exception in reducer.configure()");
            }

            logger.info("In reducer.configure() - object stor path - " + objectStorPath);
        }

        public void reduce(Text key, Iterator<DedupObjectSegmentWritable> values,
                OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
            String objName = key.toString();

            logger.debug("Process segmnts for object - " + objName);

            List<DedupObjectSegment> seglist = new ArrayList<DedupObjectSegment>();

            while (values.hasNext()) {
                DedupObjectSegmentWritable value = values.next();

                DedupObjectSegment objseg = (DedupObjectSegment) value.get();

                logger.debug("Process objseg - " + objseg);

                seglist.add(objseg);
            }

            DedupHadoopObject obj = null;

            try {
                obj = new DedupHadoopObject(objName, seglist, new DedupSegmentStor());
            }

            catch (Exception e) {
                e.printStackTrace();
                obj = null;
                logger.error("Exception while allocating object");
            }

            if (obj == null) {
                logger.error(
                        "Couldn't create object - " + obj + " from list of segments of size - " + seglist.size());
                throw new IOException(
                        "Couldn't create object - " + obj + " from list of segments of size - " + seglist.size());
            }

            logger.debug("Created dedup object - " + obj + " write now ");

            try {
                // NOTE -- should be a better way to do this?
                // write dedup object to HDFS
                obj.dumpToHDFS(fs, new Path(objectStorPath, obj.getSerializedName()));
            }

            catch (Exception e) {
                e.printStackTrace();
                logger.error("Couldn't write object to hdfs - " + obj + " exception - " + e);
                throw new IOException("Couldn't write object to hdfs - " + obj + " exception - " + e);
            }

            Text outputKey = new Text(obj.getName());

            output.collect(outputKey, successText);
        }
    }

    public static void main(String[] args) throws Exception {
        System.out.println("NOTE: Setting up logs from conf file - " + DedupStor.DEFAULT_LOG4J_FILE);

        PropertyConfigurator.configure(DedupStor.DEFAULT_LOG4J_FILE);

        JobConf conf = new JobConf(DedupStorHadoopCreateObjectsMapReduce.class);
        conf.setJobName("dedup-create-objects");

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(DedupObjectSegmentWritable.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(DedupStorHadoopCreateObjectsMapper.class);
        conf.setReducerClass(DedupStorHadoopCreateObjectsReducer.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        Path inputPath = new Path(args[0], DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_OBJECTS_TMP_PATH);
        Path segmentStorPath = new Path(args[0],
                DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_SEGMENTS_LOC_SUFFIX);
        Path objectStorPath = new Path(args[0], DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_OBJECTS_LOC_SUFFIX);
        Path objectMapPath = new Path(args[0], DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_OBJECTS_TMP_PATH);

        conf.set(DedupStorHadoopUtils.HADOOP_CONF_SEGMENTS_STOR_PATH_KEY, segmentStorPath.toString());
        conf.set(DedupStorHadoopUtils.HADOOP_CONF_OBJECTS_STOR_PATH_KEY, objectStorPath.toString());
        conf.set(DedupStorHadoopUtils.HADOOP_CONF_OBJECTS_TMP_PATH_KEY, objectMapPath.toString());

        FileInputFormat.setInputPaths(conf, inputPath);
        FileOutputFormat.setOutputPath(conf, objectStorPath);

        JobClient.runJob(conf);
    }
}