com.awcoleman.StandaloneJava.AvroCounterByRecord.java Source code

Java tutorial

Introduction

Here is the source code for com.awcoleman.StandaloneJava.AvroCounterByRecord.java

Source

package com.awcoleman.StandaloneJava;

import java.io.IOException;
import java.util.ArrayList;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

/*
 * 
 * Standalone java program that counts Avro records in hdfs file or directory.
 * 
 * 
 * //TODO-- expand to handle local fs or hdfs. Look at how Hadoop handles URIs
 *    also if no URI is specified check HDFS first and if no match, check localfs
 * 
 * //TODO -- allow HADOOP_CONF variable. Is the standard way to grab from sys env or from -D??
 * 
 * 
 * @author awcoleman
 * license: Apache License 2.0; http://www.apache.org/licenses/LICENSE-2.0 *
 */
public class AvroCounterByRecord {

    public AvroCounterByRecord(String inDirStr) throws IOException {

        long numAvroRecords = 0;

        //Get list of input files
        ArrayList<FileStatus> inputFileList = new ArrayList<FileStatus>();

        Configuration conf = new Configuration();
        conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
        conf.set("dfs.replication", "1"); //see http://stackoverflow.com/questions/24548699/how-to-append-to-an-hdfs-file-on-an-extremely-small-cluster-3-nodes-or-less

        FileSystem hdfs = null;
        try {
            hdfs = FileSystem.get(conf);
        } catch (java.io.IOException ioe) {
            System.out.println("Error opening HDFS filesystem. Exiting. Error message: " + ioe.getMessage());
            System.exit(1);
        }
        if (hdfs.getStatus() == null) {
            System.out.println("Unable to contact HDFS filesystem. Exiting.");
            System.exit(1);
        }

        //Check if input dirs/file exists and get file list (even if list of single file)
        Path inPath = new Path(inDirStr);
        if (hdfs.exists(inPath) && hdfs.isFile(inPath)) { //single file
            inputFileList.add(hdfs.getFileStatus(inPath));
        } else if (hdfs.exists(inPath) && hdfs.isDirectory(inPath)) { //dir
            //Get list of input files
            RemoteIterator<LocatedFileStatus> fileStatusListIterator = hdfs.listFiles(inPath, true);
            while (fileStatusListIterator.hasNext()) {
                LocatedFileStatus fileStatus = fileStatusListIterator.next();

                if (fileStatus.isFile() && !fileStatus.getPath().getName().equals("_SUCCESS")) {
                    inputFileList.add((FileStatus) fileStatus);
                }
            }
        } else {
            System.out.println("Input directory ( " + inDirStr + " ) not found or is not directory. Exiting.");
            System.exit(1);
        }

        for (FileStatus thisFileStatus : inputFileList) {

            //_SUCCESS files are 0 bytes
            if (thisFileStatus.getLen() == 0) {
                continue;
            }

            DataFileStream<Object> avroStream = null;
            FSDataInputStream inStream = hdfs.open(thisFileStatus.getPath());
            GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
            avroStream = new DataFileStream<Object>(inStream, reader);

            long thisFileRecords = 0;

            while (avroStream.hasNext()) {
                numAvroRecords++;
                thisFileRecords++;
                avroStream.next();
            }
            avroStream.close();
            inStream.close();

            System.out.println("Input file " + thisFileStatus.getPath() + " has " + thisFileRecords + " records.");

            //TODO test on dir with non-avro file and see what the exception is, catch that and log to output but don't die.
        }

        System.out.println("Input dir/file ( " + inDirStr + " ) has " + inputFileList.size() + " files and "
                + numAvroRecords + " total records.");

    }

    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.out.println("Requires an input directory (containing Avro files) or Avro filename. Exiting.");
            System.exit(1);
        }

        AvroCounterByRecord mainObj = new AvroCounterByRecord(args[0]);

    }

}