com.cloudera.recordbreaker.analyzer.FormatAnalyzer.java Source code

Introduction

Here is the source code for com.cloudera.recordbreaker.analyzer.FormatAnalyzer.java
Source

/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.analyzer;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericData;

/*********************************************************************************
 * <code>FormatAnalyzer</code> takes an arbitrary input file and generates a
 * file-appropriate data descriptor.  Depending on the filetype, that descriptor
 * can entail a large or small amount of work.  It can also yield a large or small
 * amount of metadata about the file's contents.
 *
 * @author "Michael Cafarella"
 * @version 1.0
 * @since 1.0
 **********************************************************************************/
public class FormatAnalyzer {
    private static final Log LOG = LogFactory.getLog(FormatAnalyzer.class);
    final static int MAX_ANALYSIS_LINES = 400;
    File schemaDbDir;

    /**
     * Creates a new <code>FormatAnalyzer</code> instance.
     */
    public FormatAnalyzer(File schemaDbDir) {
        this.schemaDbDir = schemaDbDir;
    }

    /**
     * Create a file-appropriate DataDescriptor instance.
     *
     * Right now we just use the file ending to figure out what to do,
     * but this will become unsatisfactory pretty quickly.
     *
     * @param f a <code>File</code> value
     * @return a <code>DataDescriptor</code> value
     */
    public DataDescriptor describeData(FileSystem fs, Path p) throws IOException {
        FileStatus fstatus = fs.getFileStatus(p);
        String fname = p.getName();

        // Test to see if the file is one of a handful of known structured formats.
        if (CSVDataDescriptor.isCSV(fs, p)) {
            return new CSVDataDescriptor(p, fs);
        } else if (fname.endsWith(".xml")) {
            return new XMLDataDescriptor(p, fs);
        } else if (fname.endsWith(".avro")) {
            return new AvroDataDescriptor(p, fs);
        } else if (AvroSequenceFileDataDescriptor.isAvroSequenceFile(fs, p)) {
            return new AvroSequenceFileDataDescriptor(p, fs);
        } else if (SequenceFileDataDescriptor.isSequenceFile(fs, p)) {
            return new SequenceFileDataDescriptor(p, fs);
        } else if (ApacheDataDescriptor.isApacheLogFile(fs, p)) {
            return new ApacheDataDescriptor(p, fs);
        } else if (SyslogDataDescriptor.isSyslogFile(fs, p)) {
            return new SyslogDataDescriptor(p, fs);
        } else {
            // It's not one of the known formats, so apply LearnStructure 
            // to obtain the structure.
            if (UnknownTextDataDescriptor.isTextData(fs, p)) {
                try {
                    return new UnknownTextDataDescriptor(fs, p, schemaDbDir);
                } catch (Exception iex) {
                    //iex.printStackTrace();
                }
            }
            // If that doesn't work, then give up and call it unstructured.  You
            // can't run queries on data in this format.
            return new UnstructuredFileDescriptor(fs, p);
        }
    }

    public DataDescriptor loadDataDescriptor(FileSystem fs, Path p, String identifier, List<String> schemaReprs,
            List<String> schemaDescs, List<byte[]> schemaBlobs) throws IOException {
        if (AvroDataDescriptor.AVRO_TYPE.equals(identifier)) {
            return new AvroDataDescriptor(p, fs, schemaReprs, schemaDescs, schemaBlobs);
        } else if (CSVDataDescriptor.CSV_TYPE.equals(identifier)) {
            return new CSVDataDescriptor(p, fs, schemaReprs, schemaDescs, schemaBlobs);
        } else if (SequenceFileDataDescriptor.SEQFILE_TYPE.equals(identifier)) {
            return new SequenceFileDataDescriptor(p, fs, schemaReprs, schemaDescs, schemaBlobs);
        } else if (XMLDataDescriptor.XML_TYPE.equals(identifier)) {
            return new XMLDataDescriptor(p, fs, schemaReprs, schemaDescs, schemaBlobs);
        } else if (AvroSequenceFileDataDescriptor.AVROSEQFILE_TYPE.equals(identifier)) {
            return new AvroSequenceFileDataDescriptor(p, fs, schemaReprs, schemaDescs, schemaBlobs);
        } else if (ApacheDataDescriptor.APACHE_TYPE.equals(identifier)) {
            return new ApacheDataDescriptor(p, fs, schemaReprs, schemaDescs, schemaBlobs);
        } else if (SyslogDataDescriptor.SYSLOG_TYPE.equals(identifier)) {
            return new SyslogDataDescriptor(p, fs, schemaReprs, schemaDescs, schemaBlobs);
        } else if (UnknownTextDataDescriptor.TEXTDATA_TYPE.equals(identifier)) {
            return new UnknownTextDataDescriptor(fs, p, schemaReprs, schemaDescs, schemaBlobs);
        } else {
            return new UnstructuredFileDescriptor(fs, p);
        }
    }

    /**
     * Describe <code>main</code> method here.
     *
     * @param argv[] a <code>String</code> value
     * @exception IOException if an error occurs
     */
    public static void main(String argv[]) throws IOException {
        if (argv.length < 1) {
            System.err.println("Usage: FormatAnalyzer <inputfile> <schemaDbDir>");
            return;
        }

        FileSystem fs = FileSystem.getLocal(null);
        Path inputFile = new Path(new File(argv[0]).getCanonicalPath());
        File schemaDbDir = new File(argv[1]).getCanonicalFile();
        FormatAnalyzer fa = new FormatAnalyzer(schemaDbDir);

        DataDescriptor descriptor = fa.describeData(fs, inputFile);
        System.err.println("Filename: " + descriptor.getFilename());
        System.err.println("Filetype identifier: " + descriptor.getFileTypeIdentifier());
        List<SchemaDescriptor> schemas = descriptor.getSchemaDescriptor();
        if (schemas == null) {
            System.err.println("No schema found.");
        } else {
            System.err.println("Num schemas found: " + schemas.size());
            System.err.println();
            for (SchemaDescriptor sd : schemas) {
                Schema s = sd.getSchema();
                System.err.println("Schema src desc: " + sd.getSchemaSourceDescription());
                System.err.println();
                System.err.println("Schema identifier: " + sd.getSchemaIdentifier());
                System.err.println();
                int i = 0;
                for (Iterator it = sd.getIterator(); it.hasNext();) {
                    GenericData.Record curRow = (GenericData.Record) it.next();
                    System.err.println(i + ". Elt: " + curRow);
                    i++;
                }
            }
        }
    }
}