org.apache.hadoop.examples.terasort.TeraOutputFormat.java Source code

Introduction

Here is the source code for org.apache.hadoop.examples.terasort.TeraOutputFormat.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.examples.terasort;

import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An output format that writes the key and value appended together.
 */
public class TeraOutputFormat extends FileOutputFormat<Text, Text> {
    private static final Logger LOG = LoggerFactory.getLogger(TeraOutputFormat.class);
    private OutputCommitter committer = null;

    /**
     * Set the requirement for a final sync before the stream is closed.
     */
    static void setFinalSync(JobContext job, boolean newValue) {
        job.getConfiguration().setBoolean(TeraSortConfigKeys.FINAL_SYNC_ATTRIBUTE.key(), newValue);
    }

    /**
     * Does the user want a final sync at close?
     */
    public static boolean getFinalSync(JobContext job) {
        return job.getConfiguration().getBoolean(TeraSortConfigKeys.FINAL_SYNC_ATTRIBUTE.key(),
                TeraSortConfigKeys.DEFAULT_FINAL_SYNC_ATTRIBUTE);
    }

    static class TeraRecordWriter extends RecordWriter<Text, Text> {
        private boolean finalSync = false;
        private FSDataOutputStream out;

        public TeraRecordWriter(FSDataOutputStream out, JobContext job) {
            finalSync = getFinalSync(job);
            this.out = out;
        }

        public synchronized void write(Text key, Text value) throws IOException {
            out.write(key.getBytes(), 0, key.getLength());
            out.write(value.getBytes(), 0, value.getLength());
        }

        public void close(TaskAttemptContext context) throws IOException {
            if (finalSync) {
                try {
                    out.hsync();
                } catch (UnsupportedOperationException e) {
                    /*
                     * Currently, hsync operation on striping file with erasure code
                     * policy is not supported yet. So this is a workaround to make
                     * teragen and terasort to support directory with striping files. In
                     * future, if the hsync operation is supported on striping file, this
                     * workaround should be removed.
                     */
                    LOG.info("Operation hsync is not supported so far on path with " + "erasure code policy set");
                }
            }
            out.close();
        }
    }

    @Override
    public void checkOutputSpecs(JobContext job) throws InvalidJobConfException, IOException {
        // Ensure that the output directory is set
        Path outDir = getOutputPath(job);
        if (outDir == null) {
            throw new InvalidJobConfException("Output directory not set in JobConf.");
        }

        final Configuration jobConf = job.getConfiguration();

        // get delegation token for outDir's file system
        TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { outDir }, jobConf);

        final FileSystem fs = outDir.getFileSystem(jobConf);

        try {
            // existing output dir is considered empty iff its only content is the
            // partition file.
            //
            final FileStatus[] outDirKids = fs.listStatus(outDir);
            boolean empty = false;
            if (outDirKids != null && outDirKids.length == 1) {
                final FileStatus st = outDirKids[0];
                final String fname = st.getPath().getName();
                empty = !st.isDirectory() && TeraInputFormat.PARTITION_FILENAME.equals(fname);
            }
            if (TeraSort.getUseSimplePartitioner(job) || !empty) {
                throw new FileAlreadyExistsException("Output directory " + outDir + " already exists");
            }
        } catch (FileNotFoundException ignored) {
        }
    }

    public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws IOException {
        Path file = getDefaultWorkFile(job, "");
        FileSystem fs = file.getFileSystem(job.getConfiguration());
        FSDataOutputStream fileOut = fs.create(file);
        return new TeraRecordWriter(fileOut, job);
    }

    public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException {
        if (committer == null) {
            Path output = getOutputPath(context);
            committer = new FileOutputCommitter(output, context);
        }
        return committer;
    }
}