Java tutorial: reading an Avro schema from a file, directory, or glob on HDFS (Voldemort's AvroUtils)
/*
 * Copyright 2008-2009 LinkedIn, Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package voldemort.store.readonly.mr.utils;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class AvroUtils {

    /**
     * Pull the schema off of the given file (if it is a file). If it is a
     * directory, then pull schemas off of all subfiles, and check that they
     * are all the same schema. If so, return that schema, otherwise throw an
     * exception.
     *
     * @param fs The filesystem to use
     * @param path The path from which to get the schema
     * @param checkSameSchema boolean flag to check all files in the directory
     *        for the same schema
     * @return The schema of this file or all its subfiles
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    private static Schema getSchemaFromPath(FileSystem fs, Path path, boolean checkSameSchema) {
        try {
            if (fs.isFile(path)) {
                // A single file: read the schema from the Avro container header.
                // Closing the DataFileStream also closes the underlying input stream.
                BufferedInputStream inStream = new BufferedInputStream(fs.open(path));
                GenericDatumReader datum = new GenericDatumReader();
                DataFileStream reader = new DataFileStream(inStream, datum);
                try {
                    return reader.getSchema();
                } finally {
                    reader.close();
                }
            } else {
                FileStatus[] statuses = null;
                if (fs.isDirectory(path)) {
                    // this is a directory, get schemas from all subfiles
                    statuses = fs.listStatus(path);
                } else {
                    // this is a wildcard path, get schemas from all matched files
                    statuses = fs.globStatus(path);
                }
                if (statuses == null || statuses.length == 0)
                    throw new IllegalArgumentException("No files found in path pattern "
                                                       + path.toUri().getPath());

                List<Schema> schemas = new ArrayList<Schema>();
                for (FileStatus status : statuses) {
                    if (!HadoopUtils.shouldPathBeIgnored(status.getPath())) {
                        if (!checkSameSchema) {
                            // return the first valid schema without checking all files
                            return getSchemaFromPath(fs, status.getPath(), checkSameSchema);
                        }
                        schemas.add(getSchemaFromPath(fs, status.getPath(), checkSameSchema));
                    }
                }

                // now check that all the schemas are the same
                if (schemas.size() > 0) {
                    Schema schema = schemas.get(0);
                    for (int i = 1; i < schemas.size(); i++)
                        if (!schema.equals(schemas.get(i)))
                            throw new IllegalArgumentException("The directory " + path.toString()
                                                               + " contains heterogeneous schemas: found both '"
                                                               + schema.toString() + "' and '"
                                                               + schemas.get(i).toString() + "'.");
                    return schema;
                } else {
                    throw new IllegalArgumentException("No valid metadata file found for path: "
                                                       + path.toString());
                }
            }
        } catch (Exception e) {
            throw new RuntimeException("Failed to get metadata from path: " + path, e);
        }
    }

    public static Schema getAvroSchemaFromPath(Path path) throws IOException {
        return getSchemaFromPath(path.getFileSystem(new Configuration()), path, true);
    }
}
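
For context, here is a minimal usage sketch. It assumes the class above is on the classpath along with the Avro and Hadoop client libraries, and it uses a hypothetical HDFS glob ("hdfs:///data/events/*.avro") purely for illustration; getAvroSchemaFromPath resolves the filesystem from a default Configuration, verifies that all matched files share one schema, and returns it (or throws IllegalArgumentException if the schemas differ).

import org.apache.avro.Schema;
import org.apache.hadoop.fs.Path;

import voldemort.store.readonly.mr.utils.AvroUtils;

public class AvroUtilsExample {

    public static void main(String[] args) throws Exception {
        // Hypothetical input: a glob of Avro container files on HDFS.
        Path input = new Path("hdfs:///data/events/*.avro");

        // Reads the schema from the matched files, checking they are all identical.
        Schema schema = AvroUtils.getAvroSchemaFromPath(input);

        // The schema can then be inspected or handed to a Hadoop job configuration.
        System.out.println("Schema name: " + schema.getName());
        System.out.println("Full schema: " + schema.toString(true));
    }
}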