org.apache.hadoop.hive.hbase.HiveHFileOutputFormat.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.hbase.HiveHFileOutputFormat.java. The class implements HiveOutputFormat for HFile bulk loading; until HBASE-1861 is implemented, it can only be used to load a table with a single column family, whose name must be the last path component of the hfile.family.path setting.

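Before the listing, here is a minimal sketch of how the hfile.family.path setting is resolved by the getFamilyPath method shown below: a value in the JobConf takes precedence, otherwise the table property is used. The class name FamilyPathExample and the /tmp paths are illustrative placeholders, not part of Hive.

import java.util.Properties;

import org.apache.hadoop.hive.hbase.HiveHFileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

public class FamilyPathExample {
    public static void main(String[] args) {
        JobConf jc = new JobConf();
        Properties tableProps = new Properties();

        // Only the table property is set: getFamilyPath falls back to it.
        tableProps.setProperty(HiveHFileOutputFormat.HFILE_FAMILY_PATH, "/tmp/hfiles/cf");
        System.out.println(HiveHFileOutputFormat.getFamilyPath(jc, tableProps));

        // A JobConf entry overrides the table property.
        jc.set(HiveHFileOutputFormat.HFILE_FAMILY_PATH, "/tmp/hfiles/other_cf");
        System.out.println(HiveHFileOutputFormat.getFamilyPath(jc, tableProps));
    }
}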
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.hbase;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.commons.lang.NotImplementedException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValueUtil;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Progressable;

/**
 * HiveHFileOutputFormat implements HiveOutputFormat for HFile bulk
 * loading.  Until HBASE-1861 is implemented, it can only be used
 * for loading a table with a single column family.
 */
public class HiveHFileOutputFormat extends HFileOutputFormat
        implements HiveOutputFormat<ImmutableBytesWritable, KeyValue> {

    public static final String HFILE_FAMILY_PATH = "hfile.family.path";

    static final Logger LOG = LoggerFactory.getLogger(HiveHFileOutputFormat.class.getName());

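    // Fetch the new-API record writer from HFileOutputFormat, rewrapping
    // InterruptedException as IOException for the callers below.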
    private org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, KeyValue> getFileWriter(
            org.apache.hadoop.mapreduce.TaskAttemptContext tac) throws IOException {
        try {
            return super.getRecordWriter(tac);
        } catch (InterruptedException ex) {
            throw new IOException(ex);
        }
    }

    /**
     * Retrieve the family path, first check the JobConf, then the table properties.
     * @return the family path or null if not specified.
     */
    public static String getFamilyPath(Configuration jc, Properties tableProps) {
        return jc.get(HFILE_FAMILY_PATH, tableProps.getProperty(HFILE_FAMILY_PATH));
    }

    @Override
    public RecordWriter getHiveRecordWriter(final JobConf jc, final Path finalOutPath,
            Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
            final Progressable progressable) throws IOException {

        // Read configuration for the target path, first from jobconf, then from table properties
        String hfilePath = getFamilyPath(jc, tableProperties);
        if (hfilePath == null) {
            throw new RuntimeException("Please set " + HFILE_FAMILY_PATH + " to target location for HFiles");
        }

        // Target path's last component is also the column family name.
        final Path columnFamilyPath = new Path(hfilePath);
        final String columnFamilyName = columnFamilyPath.getName();
        final byte[] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName);
        final Job job = new Job(jc);
        setCompressOutput(job, isCompressed);
        setOutputPath(job, finalOutPath);

        // Create the HFile writer
        final org.apache.hadoop.mapreduce.TaskAttemptContext tac = ShimLoader.getHadoopShims()
                .newTaskAttemptContext(job.getConfiguration(), progressable);

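        // Resolve the committer's work path for this task attempt; HFiles are
        // written there and moved to the user-specified family path on close.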
        final Path outputdir = FileOutputFormat.getOutputPath(tac);
        final Path taskAttemptOutputdir = new FileOutputCommitter(outputdir, tac).getWorkPath();
        final org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, KeyValue> fileWriter = getFileWriter(
                tac);

        // Individual columns are going to be pivoted to HBase cells,
        // and for each row, they need to be written out in order
        // of column name, so sort the column names now, creating a
        // mapping to their column position.  However, the first
        // column is interpreted as the row key.
        String columnList = tableProperties.getProperty("columns");
        String[] columnArray = columnList.split(",");
        final SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
        int i = 0;
        for (String columnName : columnArray) {
            if (i != 0) {
                columnMap.put(Bytes.toBytes(columnName), i);
            }
            ++i;
        }

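        // Adapt Text rows and PutWritables onto the new-API HFile writer.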
        return new RecordWriter() {

            @Override
            public void close(boolean abort) throws IOException {
                try {
                    fileWriter.close(null);
                    if (abort) {
                        return;
                    }
                    // Move the HFile(s) from the task output directory to the
                    // location specified by the user.
                    FileSystem fs = outputdir.getFileSystem(jc);
                    fs.mkdirs(columnFamilyPath);
                    Path srcDir = taskAttemptOutputdir;
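                    // Walk down the single-entry directory chain under the task
                    // output until the directory named after the column family
                    // is reached.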
                    for (;;) {
                        FileStatus[] files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER);
                        if ((files == null) || (files.length == 0)) {
                            throw new IOException("No family directories found in " + srcDir);
                        }
                        if (files.length != 1) {
                            throw new IOException("Multiple family directories found in " + srcDir);
                        }
                        srcDir = files[0].getPath();
                        if (srcDir.getName().equals(columnFamilyName)) {
                            break;
                        }
                        if (files[0].isFile()) {
                            throw new IOException("No family directories found in " + taskAttemptOutputdir + ". "
                                    + "The last component in hfile path should match column family name "
                                    + columnFamilyName);
                        }
                    }
                    for (FileStatus regionFile : fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER)) {
                        fs.rename(regionFile.getPath(), new Path(columnFamilyPath, regionFile.getPath().getName()));
                    }
                    // Hive actually wants a file as task output (not a directory), so
                    // replace the empty directory with an empty file to keep it happy.
                    fs.delete(taskAttemptOutputdir, true);
                    fs.createNewFile(taskAttemptOutputdir);
                } catch (InterruptedException ex) {
                    throw new IOException(ex);
                }
            }

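            // Parse a Ctrl-A delimited text row: the first field is the row
            // key, the remaining fields become cells keyed by column name.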
            private void writeText(Text text) throws IOException {
                // Decompose the incoming text row into fields.
                String s = text.toString();
                String[] fields = s.split("\u0001");
                assert (fields.length <= (columnMap.size() + 1));
                // First field is the row key.
                byte[] rowKeyBytes = Bytes.toBytes(fields[0]);
                // Remaining fields are cells addressed by column name within row.
                for (Map.Entry<byte[], Integer> entry : columnMap.entrySet()) {
                    byte[] columnNameBytes = entry.getKey();
                    int iColumn = entry.getValue();
                    String val;
                    if (iColumn >= fields.length) {
                        // trailing blank field
                        val = "";
                    } else {
                        val = fields[iColumn];
                        if ("\\N".equals(val)) {
                            // omit nulls
                            continue;
                        }
                    }
                    byte[] valBytes = Bytes.toBytes(val);
                    KeyValue kv = new KeyValue(rowKeyBytes, columnFamilyNameBytes, columnNameBytes, valBytes);
                    try {
                        fileWriter.write(null, kv);
                    } catch (IOException e) {
                        LOG.error("Failed while writing row: " + s);
                        throw e;
                    } catch (InterruptedException ex) {
                        throw new IOException(ex);
                    }
                }
            }

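            // Write each cell of the Put as a KeyValue, sorting the cells
            // within each family before handing them to the HFile writer.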
            private void writePut(PutWritable put) throws IOException {
                ImmutableBytesWritable row = new ImmutableBytesWritable(put.getPut().getRow());
                SortedMap<byte[], List<Cell>> cells = put.getPut().getFamilyCellMap();
                for (Map.Entry<byte[], List<Cell>> entry : cells.entrySet()) {
                    Collections.sort(entry.getValue(), new CellComparator());
                    for (Cell c : entry.getValue()) {
                        try {
                            fileWriter.write(row, KeyValueUtil.copyToNewKeyValue(c));
                        } catch (InterruptedException e) {
                            throw (InterruptedIOException) new InterruptedIOException().initCause(e);
                        }
                    }
                }
            }

            @Override
            public void write(Writable w) throws IOException {
                if (w instanceof Text) {
                    writeText((Text) w);
                } else if (w instanceof PutWritable) {
                    writePut((PutWritable) w);
                } else {
                    throw new IOException("Unexpected writable " + w);
                }
            }
        };
    }

    @Override
    public void checkOutputSpecs(FileSystem ignored, JobConf jc) throws IOException {
        //delegate to the new api
        Job job = new Job(jc);
        JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);

        checkOutputSpecs(jobContext);
    }

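    // Hive calls getHiveRecordWriter above, so the mapred getRecordWriter is
    // never invoked.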
    @Override
    public org.apache.hadoop.mapred.RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(
            FileSystem ignored, JobConf job, String name, Progressable progress) throws IOException {
        throw new NotImplementedException("This will not be invoked");
    }
}