com.hadoop.compression.lzo.LzoIndexer.java Source code

Introduction

Here is the source code for com.hadoop.compression.lzo.LzoIndexer.java
Source

/*
 * This file is part of Hadoop-Gpl-Compression.
 *
 * Hadoop-Gpl-Compression is free software: you can redistribute it
 * and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Hadoop-Gpl-Compression is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Hadoop-Gpl-Compression.  If not, see
 * <http://www.gnu.org/licenses/>.
 */

package com.hadoop.compression.lzo;

import java.io.IOException;
import java.net.URI;
import java.text.DecimalFormat;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Command-line tool and helper class that creates LZO index files for
 * individual files, or recursively for every matching file under a directory.
 *
 * <p>A file is considered for indexing when its path ends with the default
 * extension reported by {@code LzopCodec} and no file named
 * {@code <path> + LzoIndex.LZO_INDEX_SUFFIX} exists yet.
 */
public class LzoIndexer {
    private static final Log LOG = LogFactory.getLog(LzoIndexer.class);

    /** Whitespace emitted once per nesting level when pretty-printing log lines. */
    private static final String INDENT_STRING = "  ";

    private final Configuration conf_;
    // Kept as an instance field so the codec is only constructed when an indexer is created.
    private final String LZO_EXTENSION = new LzopCodec().getDefaultExtension();
    private final DecimalFormat df_;

    /**
     * Create an indexer that resolves filesystems using the given configuration.
     *
     * @param conf The configuration used to look up the filesystem of each path.
     */
    public LzoIndexer(Configuration conf) {
        conf_ = conf;
        df_ = new DecimalFormat("#0.00");
    }

    /**
     * Index the file given by lzoPath, or every eligible file beneath it when
     * lzoPath is a directory.
     *
     * @param lzoPath The file or directory to index.
     * @throws IOException if the filesystem cannot be obtained, or the status or
     *         listing of a visited path cannot be read.
     */
    public void index(Path lzoPath) throws IOException {
        indexInternal(lzoPath, 0);
    }

    /**
     * Return indented space for pretty printing.
     *
     * @param nestingLevel The indentation level.
     * @return {@link #INDENT_STRING} repeated nestingLevel times; empty when
     *         nestingLevel is zero or negative.
     */
    private String getNesting(int nestingLevel) {
        // StringBuilder: the buffer never leaves this method, so no synchronization is needed.
        StringBuilder sb = new StringBuilder(Math.max(nestingLevel, 0) * INDENT_STRING.length());
        for (int i = 0; i < nestingLevel; i++) {
            sb.append(INDENT_STRING);
        }
        return sb.toString();
    }

    /**
     * Lzo index a given path, calling recursively to index directories when encountered.
     * Files are only indexed if they end in the LZO extension and have no existing index file.
     *
     * @param lzoPath The base path to index.
     * @param nestingLevel For pretty printing, the nesting level.
     * @throws IOException if the filesystem cannot be obtained, or the status or
     *         listing of a visited path cannot be read.
     */
    private void indexInternal(Path lzoPath, int nestingLevel) throws IOException {
        // Resolve the filesystem from the Path itself. Round-tripping through
        // URI.create(lzoPath.toString()) fails for paths containing characters that
        // are illegal in a URI string (e.g. spaces), because Path.toString() is unescaped.
        FileSystem fs = lzoPath.getFileSystem(conf_);
        FileStatus fileStatus = fs.getFileStatus(lzoPath);
        String indent = getNesting(nestingLevel);

        // Recursively walk
        if (fileStatus.isDir()) {
            LOG.info(indent + "LZO Indexing directory " + lzoPath + "...");
            FileStatus[] statuses = fs.listStatus(lzoPath);
            if (statuses == null) {
                // NOTE(review): some FileSystem implementations may return null instead of
                // throwing when the directory cannot be listed; treat that as "nothing to do".
                return;
            }
            for (FileStatus childStatus : statuses) {
                indexInternal(childStatus.getPath(), nestingLevel + 1);
            }
        } else if (lzoPath.toString().endsWith(LZO_EXTENSION)) {
            Path lzoIndexPath = new Path(lzoPath.toString() + LzoIndex.LZO_INDEX_SUFFIX);
            if (fs.exists(lzoIndexPath)) {
                LOG.info(indent + "[SKIP] LZO index file already exists for " + lzoPath + "\n");
            } else {
                long startTime = System.currentTimeMillis();
                long fileSize = fileStatus.getLen();

                LOG.info(indent + "[INDEX] LZO Indexing file " + lzoPath + ", size "
                        + df_.format(fileSize / (1024.0 * 1024.0 * 1024.0)) + " GB...");
                if (indexSingleFile(fs, lzoPath)) {
                    long indexSize = fs.getFileStatus(lzoIndexPath).getLen();
                    long elapsedMillis = System.currentTimeMillis() - startTime;
                    double elapsed = elapsedMillis / 1000.0;
                    // Clamp the divisor to 1 ms so a sub-millisecond run does not
                    // report an infinite or NaN throughput.
                    double throughputSeconds = Math.max(elapsedMillis, 1L) / 1000.0;
                    LOG.info(indent + "Completed LZO Indexing in " + df_.format(elapsed)
                            + " seconds (" + df_.format(fileSize / (1024.0 * 1024.0 * throughputSeconds))
                            + " MB/s).  Index size is " + df_.format(indexSize / 1024.0) + " KB.\n");
                }
            }
        }
    }

    /**
     * Create an lzo index for a single file by delegating to {@code LzoIndex.createIndex}.
     * An IOException raised while indexing is logged and reported through the
     * return value rather than propagated, so one bad file does not abort a
     * directory walk.
     *
     * @param fs The filesystem object.
     * @param lzoPath The path to index (must be a file, not a directory).
     * @return true if the index was created without an IOException, false otherwise.
     */
    private boolean indexSingleFile(FileSystem fs, Path lzoPath) {
        try {
            LzoIndex.createIndex(fs, lzoPath);
            return true;
        } catch (IOException e) {
            LOG.error("Error indexing " + lzoPath, e);
            return false;
        }
    }

    /**
     * Run the LzoIndexer on each path passed as a command-line argument.
     * Prints usage and exits with status 1 when no arguments are given. An
     * IOException for one argument is logged and the remaining arguments are
     * still processed.
     *
     * @param args Paths of files or directories to index.
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            printUsage();
            System.exit(1);
        }

        LzoIndexer lzoIndexer = new LzoIndexer(new Configuration());
        for (String arg : args) {
            try {
                lzoIndexer.index(new Path(arg));
            } catch (IOException e) {
                LOG.error("Error indexing " + arg, e);
            }
        }
    }

    /**
     * Print the command-line usage string to standard output.
     */
    public static void printUsage() {
        System.out.println(
                "Usage: hadoop jar /path/to/this/jar com.hadoop.compression.lzo.LzoIndexer <file.lzo | directory> [file2.lzo directory3 ...]");
    }
}