org.archive.wayback.hadoop.CDXSortDriver.java Source code

Introduction

Here is the source code for org.archive.wayback.hadoop.CDXSortDriver.java
Source

/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.wayback.hadoop;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.archive.wayback.util.ByteOp;

/**
 * @author brad
 *
 */
public class CDXSortDriver implements Tool {
    Configuration conf = null;
    /**
     * As hard-coded into the Text RecordWriter
     */
    public static String TEXT_OUTPUT_DELIM_CONFIG = "mapred.textoutputformat.separator";

    private static int countLinesInPath(Path path, Configuration conf) throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        FSDataInputStream is = fs.open(path);
        BufferedReader br = new BufferedReader(new InputStreamReader(is, ByteOp.UTF8));
        int lineCount = 0;
        while (br.readLine() != null) {
            lineCount++;
        }
        is.close();
        return lineCount;
    }

    static int printUsage() {
        System.out.println("cdxsort <split> <input> <output>");
        System.out.println("cdxsort [OPTIONS] <split> <input> <output>");
        System.out.println("\tOPTIONS can be:");
        System.out.println("\t\t-m NUM - try to run with approximately NUM map tasks");
        System.out.println("\t\t--compressed-input - assume input is compressed, even without .gz suffix");
        System.out.println("\t\t--gzip-range - assume input lines are PATH START LENGTH such that a");
        System.out.println("\t\t\t valid gzip record exists in PATH between START and START+LENGTH");
        System.out.println("\t\t\t that contains the records to process");
        System.out.println("\t\t--compress-output - compress output files with GZip");
        System.out.println(
                "\t\t--delimiter DELIM - assume DELIM delimter for input and output, instead of default <SPACE>");
        System.out.println("\t\t--map-global - use the GLOBAL CDX map function, which implies:");
        System.out.println(
                "\t\t\t. extra trailing field indicating HTML meta NOARCHIVE data, which should be omitted, result lines do not include the last field");
        System.out.println("\t\t\t. truncating digest field to 3 digits");
        System.out.println("\t\t\t. column 0 is original URL (identity CDX files)");
        System.out.println();
        //      ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    /**
     * The main driver for sort program. Invoke this method to submit the
     * map/reduce job.
     * 
     * @throws IOException
     *             When there is communication problems with the job tracker.
     */
    public int run(String[] args) throws Exception {

        String delim = " ";

        long desiredMaps = 10;
        boolean compressOutput = false;
        boolean compressedInput = false;
        boolean gzipRange = false;
        List<String> otherArgs = new ArrayList<String>();
        int mapMode = CDXCanonicalizingMapper.MODE_FULL;
        for (int i = 0; i < args.length; ++i) {
            try {
                if ("-m".equals(args[i])) {
                    desiredMaps = Integer.parseInt(args[++i]);
                } else if ("--compress-output".equals(args[i])) {
                    compressOutput = true;
                } else if ("--compressed-input".equals(args[i])) {
                    compressedInput = true;
                } else if ("--gzip-range".equals(args[i])) {
                    gzipRange = true;
                } else if ("--delimiter".equals(args[i])) {
                    delim = args[++i];
                } else if ("--map-full".equals(args[i])) {
                    mapMode = CDXCanonicalizingMapper.MODE_FULL;
                } else if ("--map-global".equals(args[i])) {
                    mapMode = CDXCanonicalizingMapper.MODE_GLOBAL;
                } else {
                    otherArgs.add(args[i]);
                }
            } catch (NumberFormatException except) {
                System.out.println("ERROR: Integer expected instead of " + args[i]);
                return printUsage();
            } catch (ArrayIndexOutOfBoundsException except) {
                System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
                return printUsage(); // exits
            }
        }

        // Make sure there are exactly 3 parameters left: split input output
        if (otherArgs.size() != 3) {
            System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 3.");
            return printUsage();
        }

        String splitPathString = otherArgs.get(0);
        String inputPathString = otherArgs.get(1);
        String outputPathString = otherArgs.get(2);

        Path splitPath = new Path(splitPathString);
        Path inputPath = new Path(inputPathString);
        Path outputPath = new Path(outputPathString);

        Job job = new Job(getConf(), "cdx-sort");
        Configuration conf = job.getConfiguration();
        job.setJarByClass(CDXSortDriver.class);

        job.setMapperClass(CDXCanonicalizingMapper.class);

        job.setReducerClass(CDXReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // configure the "map mode"
        CDXCanonicalizingMapper.setMapMode(conf, mapMode);

        // set up the delimter:
        conf.set(TEXT_OUTPUT_DELIM_CONFIG, delim);

        if (compressOutput) {
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        }

        // set up the Partitioner, including number of reduce tasks:
        FileSystem fs = inputPath.getFileSystem(conf);

        int splitCount = countLinesInPath(splitPath, conf);
        System.err.println("Split/Reduce count:" + splitCount);
        job.setNumReduceTasks(splitCount);

        AlphaPartitioner.setPartitionPath(conf, splitPathString);
        job.setPartitionerClass(AlphaPartitioner.class);

        // calculate the byte size to get the correct number of map tasks:
        FileStatus inputStatus = fs.getFileStatus(inputPath);
        long inputLen = inputStatus.getLen();
        long bytesPerMap = (int) inputLen / desiredMaps;

        FileInputFormat.addInputPath(job, inputPath);
        FileInputFormat.setMaxInputSplitSize(job, bytesPerMap);
        if (gzipRange) {
            job.setInputFormatClass(GZIPRangeLineDereferencingInputFormat.class);
        } else {
            job.setInputFormatClass(LineDereferencingInputFormat.class);
            if (compressedInput) {
                LineDereferencingRecordReader.forceCompressed(conf);
            }
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        return (job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new CDXSortDriver(), args);
        System.exit(res);
    }

    public Configuration getConf() {
        return conf;
    }

    public void setConf(Configuration conf) {
        this.conf = conf;
    }

}