com.moz.fiji.hive.FijiTableInputFormat.java Source code

Java tutorial

Introduction

Here is the source code for com.moz.fiji.hive.FijiTableInputFormat.java

Source

/**
 * (c) Copyright 2013 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.moz.fiji.hive;

import java.io.IOException;
import java.util.Collection;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.moz.fiji.hive.io.FijiRowDataWritable;
import com.moz.fiji.schema.Fiji;
import com.moz.fiji.schema.FijiRegion;
import com.moz.fiji.schema.FijiTable;
import com.moz.fiji.schema.FijiURI;
import com.moz.fiji.schema.util.ResourceUtils;

/**
 * An input format that reads from Fiji Tables.
 *
 * <p>
 *   This input format exists in addition to the
 *   {@code com.moz.fiji.mapreduce.framework.FijiTableInputFormat} because we need one that is
 *   an instance of mapred.InputFormat (not mapreduce.InputFormat) for integration with hive.
 * </p>
 *
 * <p>
 *   The hook that hive provides for turning MapReduce records into rows of a 2-dimensional
 *   SQL-like table is called a Deserializer. Unfortunately, Deserializers only have access to
 *   the value of the record (not the key). This means that even though this input format
 *   generates ImmutableBytesWritable keys that contain the row key of the input fiji table, the
 *   Deserializer won't have access to it. Hence, all of the data required to generate the
 *   2-dimensional view of the row must be contained in the value (in this case, the
 *   FijiRowDataWritable).
 * </p>
 */
public class FijiTableInputFormat implements InputFormat<ImmutableBytesWritable, FijiRowDataWritable> {
    private static final Logger LOG = LoggerFactory.getLogger(FijiTableInputFormat.class);

    /** Prefix string for data-request configuration keys; not referenced within this class. */
    public static final String CONF_FIJI_DATA_REQUEST_PREFIX = "fiji.data.request.";

    /**
     * Returns an object responsible for generating records contained in a
     * given input split.
     *
     * @param split The input split to create a record reader for. It is cast to
     *     {@code FijiTableInputSplit}, so any other split type fails with a
     *     {@link ClassCastException}.
     * @param job The job configuration.
     * @param reporter A job info reporter (for counters, status, etc.). Unused here.
     * @return The record reader.
     * @throws IOException If there is an error.
     */
    @Override
    public RecordReader<ImmutableBytesWritable, FijiRowDataWritable> getRecordReader(InputSplit split, JobConf job,
            Reporter reporter) throws IOException {
        // Cast to Object so SLF4J formats the whole array for the single placeholder; an uncast
        // String[] binds to the varargs overload and only its first element would be logged.
        LOG.info("Getting record reader {}", (Object) split.getLocations());
        return new FijiTableRecordReader((FijiTableInputSplit) split, job);
    }

    /**
     * Returns an array of input splits to be used as input to map tasks.
     *
     * <p>
     *   One split is produced per region of the configured Fiji table. If an
     *   {@link IOException} occurs while opening the table or reading its regions, the
     *   failure is logged and an empty array is returned instead of propagating the error.
     * </p>
     *
     * @param job The job configuration.
     * @param numTasks A hint from the MR framework for the number of mappers.
     * @return The specifications of each split.
     * @throws IOException If there is an error.
     */
    @Override
    public InputSplit[] getSplits(JobConf job, int numTasks) throws IOException {
        // TODO: Use the numTasks hint effectively. We just ignore it right now.

        final FijiURI fijiURI = getFijiURI(job);
        final InputSplit[] splits;

        Fiji fiji = null;
        FijiTable fijiTable = null;
        try {
            fiji = Fiji.Factory.open(fijiURI);
            fijiTable = fiji.openTable(fijiURI.getTable());

            // Get the start keys for each region in the table.
            final List<FijiRegion> fijiRegions = fijiTable.getRegions();
            splits = new InputSplit[fijiRegions.size()];

            // The placeholder path is the same for every split, so resolve it once. It is only
            // looked up when there is at least one region, matching the per-region lookup it replaces.
            final Path dummyPath = fijiRegions.isEmpty() ? null : FileInputFormat.getInputPaths(job)[0];

            for (int i = 0; i < fijiRegions.size(); i++) {
                final FijiRegion fijiRegion = fijiRegions.get(i);
                final byte[] regionStartKey = fijiRegion.getStartKey();
                final byte[] regionEndKey = fijiRegion.getEndKey();

                final Collection<String> regionLocations = fijiRegion.getLocations();
                String regionHost = null;
                if (!regionLocations.isEmpty()) {
                    // TODO: Allow the usage of regions that aren't the first.
                    regionHost = extractHost(regionLocations.iterator().next());
                } else {
                    LOG.warn("No locations found for region: {}", fijiRegion.toString());
                }
                splits[i] = new FijiTableInputSplit(fijiURI, regionStartKey, regionEndKey, regionHost, dummyPath);
            }
        } catch (IOException e) {
            // NOTE(review): best-effort behavior kept as-is; the job sees zero splits rather than a failure.
            LOG.warn("Unable to get region information.  Returning an empty list of splits.");
            LOG.warn(StringUtils.stringifyException(e));
            return new InputSplit[0];
        } finally {
            ResourceUtils.releaseOrLog(fijiTable);
            ResourceUtils.releaseOrLog(fiji);
        }
        return splits;
    }

    /**
     * Extracts the host portion of a region location string.
     *
     * @param regionLocation A region location; everything before the first {@code ':'} is
     *     treated as the host.
     * @return The text before the first {@code ':'}, or the whole string when it contains no
     *     {@code ':'}.
     */
    private static String extractHost(String regionLocation) {
        final int separatorIndex = regionLocation.indexOf(':');
        // indexOf returns -1 when there is no separator; substring(0, -1) would throw.
        return (separatorIndex < 0) ? regionLocation : regionLocation.substring(0, separatorIndex);
    }

    /**
     * Gets the URI of the fiji table this input format will read from.
     *
     * @param conf The job configuration.
     * @return The URI built from the {@code FijiTableInfo.FIJI_TABLE_URI} configuration value.
     * @throws RuntimeException If the configuration does not contain that value.
     */
    private static FijiURI getFijiURI(Configuration conf) {
        final String fijiURIString = conf.get(FijiTableInfo.FIJI_TABLE_URI);
        if (null == fijiURIString) {
            throw new RuntimeException("FijiTableInputFormat needs to be configured. " + "Please specify "
                    + FijiTableInfo.FIJI_TABLE_URI + " in the configuration.");
        }
        return FijiURI.newBuilder(fijiURIString).build();
    }
}