HiveKeyIgnoringBAMOutputFormat.java Source code

Introduction

Here is the source code for HiveKeyIgnoringBAMOutputFormat.java. The class wraps Hadoop-BAM's KeyIgnoringBAMOutputFormat behind Hive's HiveOutputFormat interface so that Hive queries can write their results directly as BAM files. It serves both the old mapred OutputFormat API and Hive's FileSinkOperator.RecordWriter API, and builds the output BAM header by merging the SAM headers of all input files.
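
Before the full listing below, note that the format can also be driven as a plain mapred OutputFormat outside of Hive. A minimal usage sketch, assuming hypothetical input and output paths (the paths, class name, and job name here are illustrative, not from the project):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class BAMOutputExample {
    public static void main(String[] args) throws Exception {
        final JobConf job = new JobConf();
        job.setJobName("bam-output-example");
        // setSAMHeaderFrom() reads a SAM header from every file under the
        // input paths, so these directories must contain only BAM files.
        FileInputFormat.setInputPaths(job, new Path("/data/bam_input"));
        FileOutputFormat.setOutputPath(job, new Path("/data/bam_output"));
        job.setOutputFormat(HiveKeyIgnoringBAMOutputFormat.class);
        // Mapper/reducer wiring elided; the output values must be
        // SAMRecordWritable.
        JobClient.runJob(job);
    }
}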

Source

// Copyright (c) 2013 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
//
// File created: 2013-02-05 15:14:48

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.Progressable;

import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;

import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;

import net.sf.picard.sam.SamFileHeaderMerger;

import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;

import fi.tkk.ics.hadoop.bam.KeyIgnoringBAMOutputFormat;
import fi.tkk.ics.hadoop.bam.KeyIgnoringBAMRecordWriter;
import fi.tkk.ics.hadoop.bam.SAMRecordWritable;

// Wraps KeyIgnoringBAMOutputFormat, providing Hive's custom RecordWriter API.
public class HiveKeyIgnoringBAMOutputFormat extends FileOutputFormat<Writable, SAMRecordWritable>
        implements HiveOutputFormat<Writable, SAMRecordWritable> {
    private KeyIgnoringBAMOutputFormat<Writable> wrappedOutputFormat = new KeyIgnoringBAMOutputFormat<Writable>();

    private void setSAMHeaderFrom(JobConf job) throws IOException {
        if (wrappedOutputFormat.getSAMHeader() != null)
            return;

        // XXX: We're not told where to take the SAM header from so we just merge
        // them all. There should probably be a better way of doing this.

        final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();

        // The "best" sort order among the headers: unsorted if they're sorted
        // differently, otherwise their common sort order.
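        // For example, two coordinate-sorted headers merge as coordinate,
        // but a coordinate-sorted header and a queryname-sorted header
        // merge as unsorted.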
        SAMFileHeader.SortOrder sortOrder = null;

        // XXX: it seems that FileInputFormat.getInputPaths(job) will point to
        // the directories of the input tables in the query. I'm not sure if this
        // is always the case.
        for (final Path table : FileInputFormat.getInputPaths(job)) {
            final FileSystem fs = table.getFileSystem(job);
            for (final FileStatus stat : fs.listStatus(table)) {
                if (!stat.isFile())
                    throw new IOException("Unexpected directory '" + stat.getPath() + "', expected only files");

                final SAMFileReader r = new SAMFileReader(fs.open(stat.getPath()));
                final SAMFileHeader h = r.getFileHeader();
                r.close();
                headers.add(h);

                if (sortOrder == null) {
                    sortOrder = h.getSortOrder();
                    continue;
                }
                if (sortOrder == SAMFileHeader.SortOrder.unsorted)
                    continue;
                if (sortOrder != h.getSortOrder())
                    sortOrder = SAMFileHeader.SortOrder.unsorted;
            }
        }

        wrappedOutputFormat.setSAMHeader(new SamFileHeaderMerger(sortOrder, headers, true).getMergedHeader());
    }

    @Override
    public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf job, Path finalOutPath,
            final Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
            Progressable progress) throws IOException {
        setSAMHeaderFrom(job);

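        // FakeTaskAttemptContext is a helper class defined elsewhere in
        // this project: it presents the mapred JobConf as the mapreduce
        // TaskAttemptContext that KeyIgnoringBAMOutputFormat expects. (A
        // hedged sketch of such a helper follows the source below.)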
        final FakeTaskAttemptContext ctx = new FakeTaskAttemptContext(job);

        final org.apache.hadoop.mapreduce.RecordWriter<Writable, SAMRecordWritable> wrappedRecordWriter = wrappedOutputFormat
                .getRecordWriter(ctx, finalOutPath);

        return new FileSinkOperator.RecordWriter() {
            @Override
            public void write(Writable rec) throws IOException {
                try {
                    wrappedRecordWriter.write(null, (SAMRecordWritable) rec);
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
            }

            @Override
            public void close(boolean abort) throws IOException {
                try {
                    wrappedRecordWriter.close(ctx);
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
            }
        };
    }

    @Override
    public RecordWriter<Writable, SAMRecordWritable> getRecordWriter(FileSystem fs, JobConf job, String name,
            Progressable progress) throws IOException {
        setSAMHeaderFrom(job);

        final FakeTaskAttemptContext ctx = new FakeTaskAttemptContext(job);

        final org.apache.hadoop.mapreduce.RecordWriter<Writable, SAMRecordWritable> wrappedRecordWriter = wrappedOutputFormat
                .getRecordWriter(ctx, FileOutputFormat.getTaskOutputPath(job, name));

        return new RecordWriter<Writable, SAMRecordWritable>() {
            @Override
            public void write(Writable ignored, SAMRecordWritable rec) throws IOException {
                try {
                    wrappedRecordWriter.write(ignored, rec);
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
            }

            @Override
            public void close(Reporter reporter) throws IOException {
                try {
                    wrappedRecordWriter.close(ctx);
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
            }
        };
    }
}
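
Note that FakeTaskAttemptContext is referenced but not defined in this file; it lives elsewhere in the project. A minimal sketch of what such a helper could look like, assuming Hadoop 1.x where org.apache.hadoop.mapreduce.TaskAttemptContext is a concrete class (the implementation below is an assumption, not the project's actual code):

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;

class FakeTaskAttemptContext extends TaskAttemptContext {
    public FakeTaskAttemptContext(JobConf conf) {
        // A throwaway task attempt ID: this context exists only to carry
        // the job configuration into the wrapped mapreduce output format.
        super(conf, new TaskAttemptID());
    }
}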