datafu.hourglass.avro.AvroDateRangeMetadata.java Source code

Java tutorial

Introduction

Here is the source code for datafu.hourglass.avro.AvroDateRangeMetadata.java

Source

/**
* Copyright 2013 LinkedIn, Inc
* 
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
* 
* http://www.apache.org/licenses/LICENSE-2.0
* 
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package datafu.hourglass.avro;

import java.io.IOException;
import java.util.Date;

import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import datafu.hourglass.fs.DateRange;
import datafu.hourglass.fs.PathUtils;

/**
 * Manages the storage and retrieval of date ranges in the metadata of Avro files.
 * This is used by {@link datafu.hourglass.jobs.AbstractPartitionCollapsingIncrementalJob} so that when reusing previous
 * output it can determine the date range the data corresponds to.
 * 
 * @author "Matthew Hayes"
 *
 */
public class AvroDateRangeMetadata {
    /** Avro metadata key holding the start of the date range, stored as epoch milliseconds. */
    public static final String METADATA_DATE_START = "hourglass.date.start";

    /** Avro metadata key holding the end of the date range, stored as epoch milliseconds. */
    public static final String METADATA_DATE_END = "hourglass.date.end";

    /**
     * Reads the date range from the metadata stored in an Avro file.
     * The first non-hidden entry listed under <code>path</code> is opened and its
     * {@link #METADATA_DATE_START} and {@link #METADATA_DATE_END} metadata values
     * are parsed as epoch milliseconds.
     * 
     * @param fs file system to access path
     * @param path path to get date range for
     * @return date range
     * @throws IOException if no non-hidden file exists under the path, the file cannot be read
     *         as an Avro data file, or the date range metadata is missing or malformed
     */
    public static DateRange getOutputFileDateRange(FileSystem fs, Path path) throws IOException {
        Path file = requireFirst(fs.listStatus(path, PathUtils.nonHiddenPathFilter), path).getPath();

        FSDataInputStream dataInputStream = fs.open(file);
        try {
            // Constructing the DataFileStream reads the file header and may throw; the outer
            // finally guarantees the raw input stream is released in that case as well.
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(dataInputStream, reader);
            try {
                long start = readTimestamp(dataFileStream, METADATA_DATE_START, file);
                long end = readTimestamp(dataFileStream, METADATA_DATE_END, file);
                return new DateRange(new Date(start), new Date(end));
            } finally {
                dataFileStream.close();
            }
        } finally {
            dataInputStream.close();
        }
    }

    /**
     * Updates the Hadoop configuration so that the Avro files which are written have date range
     * information stored in the metadata.  This should be used in conjunction with 
     * {@link AvroKeyValueWithMetadataRecordWriter}.
     * 
     * @param conf configuration to store date range in
     * @param dateRange date range
     */
    public static void configureOutputDateRange(Configuration conf, DateRange dateRange) {
        // store the date range in the output file's metadata
        conf.set(AvroKeyValueWithMetadataRecordWriter.TEXT_PREFIX + METADATA_DATE_START,
                Long.toString(dateRange.getBeginDate().getTime()));
        conf.set(AvroKeyValueWithMetadataRecordWriter.TEXT_PREFIX + METADATA_DATE_END,
                Long.toString(dateRange.getEndDate().getTime()));
    }

    /**
     * Returns the first element of a directory listing, failing with a descriptive
     * error instead of an index or null-pointer exception when the listing is empty.
     * 
     * @param entries listing result, may be null or empty
     * @param directory path that was listed, used only for the error message
     * @return the first entry
     * @throws IOException if there are no entries
     */
    private static <T> T requireFirst(T[] entries, Path directory) throws IOException {
        if (entries == null || entries.length == 0) {
            throw new IOException("No non-hidden files found under " + directory);
        }
        return entries[0];
    }

    /**
     * Reads a metadata entry from the Avro file and parses it as a long.
     * 
     * @param stream open Avro data file stream
     * @param key metadata key to read
     * @param file file being read, used only for error messages
     * @return parsed value
     * @throws IOException if the entry is absent or is not a valid long
     */
    private static long readTimestamp(DataFileStream<GenericRecord> stream, String key, Path file)
            throws IOException {
        String value = stream.getMetaString(key);
        if (value == null) {
            throw new IOException("Missing metadata '" + key + "' in " + file);
        }
        try {
            return Long.parseLong(value);
        } catch (NumberFormatException e) {
            throw new IOException("Invalid value '" + value + "' for metadata '" + key + "' in " + file, e);
        }
    }
}