com.ibm.jaql.io.hadoop.DefaultHadoopOutputAdapter.java Source code

Introduction

Here is the source code for com.ibm.jaql.io.hadoop.DefaultHadoopOutputAdapter.java. This adapter writes JSON values from Jaql to Hadoop through a configurable OutputFormat: in sequential mode it fabricates a local job and task context so output committers work outside MapReduce, while in parallel mode it serializes its configuration into the JobConf so each task can rebuild the adapter via configure().

Source

/*
 * Copyright (C) IBM Corp. 2008.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.ibm.jaql.io.hadoop;

import java.io.IOException;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.ExposeJobContext;
import org.apache.hadoop.mapred.ExposeTaskAttemptContext;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.OutputCommitter;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TaskAttemptContext;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.mapred.TaskID;
import org.apache.hadoop.util.Progressable;
import org.apache.log4j.Logger;

import com.ibm.jaql.io.Adapter;
import com.ibm.jaql.io.AdapterStore;
import com.ibm.jaql.io.ClosableJsonWriter;
import com.ibm.jaql.io.hadoop.converter.KeyValueExport;
import com.ibm.jaql.json.type.BufferedJsonArray;
import com.ibm.jaql.json.type.BufferedJsonRecord;
import com.ibm.jaql.json.type.JsonBool;
import com.ibm.jaql.json.type.JsonRecord;
import com.ibm.jaql.json.type.JsonString;
import com.ibm.jaql.json.type.JsonValue;

/**
 * The default adapter for writing JSON values from Jaql to Hadoop.
 */
public class DefaultHadoopOutputAdapter<K, V> implements HadoopOutputAdapter {
    static final Logger LOG = Logger.getLogger(DefaultHadoopOutputAdapter.class.getName());

    protected static final AtomicInteger jobCounter = new AtomicInteger(0);

    protected OutputFormat<?, ?> oFormat;

    protected InitializableConfSetter configurator;

    protected KeyValueExport<K, V> converter;

    protected JobConf conf;

    protected BufferedJsonRecord args; // original arguments

    protected String location;

    protected BufferedJsonRecord options; // options from arguments unioned with defaults in registry

    protected Progressable reporter;

    protected RecordWriter<JsonHolder, JsonHolder> writer = null;

    protected JobContext sequentialJob; // job context for sequential mode
    protected TaskAttemptContext sequentialTask; // task attempt context for sequential mode

    @Override
    public void init(JsonValue args) throws Exception {
        init((JsonRecord) args);
    }

    /**
     * Initializes this adapter from the given descriptor record: resolves the
     * location and options via the AdapterStore, then instantiates the
     * configured output format, configurator, and optional converter.
     * 
     * @param args the output descriptor record
     * @throws Exception if a configured class cannot be instantiated
     */
    @SuppressWarnings("unchecked")
    protected void init(JsonRecord args) throws Exception {
        this.args = new BufferedJsonRecord(args.size());
        this.args.setCopy(args);

        // set the location
        this.location = AdapterStore.getStore().getLocation(args);

        // set the options
        this.options = AdapterStore.getStore().output.getOption(args);

        // set the format
        this.oFormat = (OutputFormat<?, ?>) AdapterStore.getStore().getClassFromRecord(options, FORMAT_NAME, null)
                .newInstance();

        // set the configurator
        Class<?> configuratorClass = AdapterStore.getStore().getClassFromRecord(options, CONFIGURATOR_NAME, null);
        if (configuratorClass != null) {
            this.configurator = (InitializableConfSetter) configuratorClass.newInstance();
            this.configurator.init(args);
        }

        // set the converter
        Class<?> converterClass = AdapterStore.getStore().getClassFromRecord(options, CONVERTER_NAME, null);
        if (converterClass != null) {
            this.converter = (KeyValueExport<K, V>) converterClass.newInstance();
            this.converter.init(options);
        }
    }

    /*
     * (non-Javadoc)
     * 
     * @see com.ibm.jaql.lang.StorableAdapter#open()
     */
    public void open() throws Exception {
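        // Sequential (non-MapReduce) execution: build a standalone JobConf and
        // a no-op reporter, then fake enough of a task environment for the
        // OutputFormat and its committer to behave as they would in a real job.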
        this.conf = new JobConf();
        this.reporter = Reporter.NULL;

        // Some OutputFormats (like FileOutputFormat) require that the job id/task id be set,
        // so we set them for all output formats, just in case they need them too.
        JobID jobid = new JobID("sequential", jobCounter.getAndIncrement());
        TaskAttemptID taskid = new TaskAttemptID(new TaskID(jobid, true, 0), 0);
        conf.set("mapred.task.id", taskid.toString());

        setSequential(conf);

        // Create a task so we can use committers.
        sequentialJob = new ExposeJobContext(conf, jobid);
        sequentialTask = new ExposeTaskAttemptContext(conf, taskid);

        // Give the committer a chance to initialize.
        OutputCommitter committer = conf.getOutputCommitter();
        // FIXME: We skip job setup for now because  
        committer.setupJob(sequentialJob);
        committer.setupTask(sequentialTask);

        if (oFormat instanceof JobConfigurable)
            ((JobConfigurable) oFormat).configure(conf);
    }

    /*
     * (non-Javadoc)
     * 
     * @see com.ibm.jaql.lang.StorableAdapter#close()
     */
    public void close() throws Exception {
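        // Close the underlying RecordWriter, then let the committer promote
        // the task's output from its temporary location to the final one.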
        if (writer != null) {
            writer.close((Reporter) reporter);
            writer = null;
            OutputCommitter committer = conf.getOutputCommitter();
            committer.commitTask(sequentialTask);
            // FIXME: We skip the job cleanup because the FileOutputCommitter deletes the _temporary
            // directory, which is shared by all sequential jobs.
            // committer.cleanupJob(sequentialJob);
            sequentialTask = null;
            //      
            //      if (committer instanceof FileOutputCommitter) {
            //       
            //        // for this case, only one file is expected
            //        String fileName = new Path(location).getName();
            //        Path pTgt = null;
            //        Path src = null;
            //        try {
            //          pTgt = FileOutputFormat.getOutputPath(conf);
            //          src = FileOutputFormat.getTaskOutputPath(conf, fileName);
            //        } catch(Exception e) {
            //          // TODO: this can happen if the OutputFormat is not a FileOutputFormat,
            //          // i.e., for HBase
            //          LOG.warn("task output files not found");
            //        }
            //        if(pTgt != null && src != null) {
            //          Path tgt = new Path(FileOutputFormat.getOutputPath(conf), fileName);
            //
            //          FileSystem fs = src.getFileSystem(conf);
            //          if(fs.exists(tgt)) {
            //            fs.delete(tgt, true);
            //          }
            //
            //
            //          // rename src to tgt
            //          fs.rename(src, tgt);
            //
            //          // clean-up the temp
            //          Path tmp = new Path(FileOutputFormat.getOutputPath(conf), FileOutputCommitter.TEMP_DIR_NAME);
            //          if(fs.exists(tmp))
            //            fs.delete(tmp, true);
            //        }
            //      }
        }
    }

    protected JsonHolder keyHolder() {
        return new JsonHolderDefault();
    }

    protected JsonHolder valueHolder() {
        return new JsonHolderDefault();
    }

    /*
     * (non-Javadoc)
     * 
     * @see com.ibm.jaql.lang.OutputAdapter#getRecordWriter()
     */
    public ClosableJsonWriter getWriter() throws Exception {
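        // Wrap the Hadoop RecordWriter in a ClosableJsonWriter so that callers
        // can write plain JsonValues; the key slot stays an empty holder.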
        Path lPath = new Path(location);

        final RecordWriter<JsonHolder, JsonHolder> writer = getRecordWriter(FileSystem.get(conf), conf,
                lPath.getName(), reporter);
        return new ClosableJsonWriter() {
            JsonHolder keyHolder = keyHolder();
            JsonHolder valueHolder = valueHolder();

            public void flush() throws IOException {
                // TODO: hmm... RecordWriter has no flush method... 
            }

            public void close() throws IOException {
                writer.close(null);
            }

            public void write(JsonValue value) throws IOException {
                valueHolder.value = value;
                writer.write(keyHolder, valueHolder);
            }
        };
    }

    /*
     * (non-Javadoc)
     * 
     * @see org.apache.hadoop.mapred.OutputFormat#getRecordWriter(org.apache.hadoop.fs.FileSystem,
     *      org.apache.hadoop.mapred.JobConf, java.lang.String,
     *      org.apache.hadoop.util.Progressable)
     */
    @SuppressWarnings("unchecked")
    public RecordWriter<JsonHolder, JsonHolder> getRecordWriter(FileSystem ignored, JobConf job, String name,
            Progressable progress) throws IOException {
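        // Two paths: with no converter, JSON holders flow straight through to
        // the wrapped OutputFormat (the key is ignored); with a converter,
        // each JSON value is first exported into the target key/value types.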
        if (converter == null) {
            final RecordWriter<JsonHolder, JsonHolder> baseWriter = ((OutputFormat<JsonHolder, JsonHolder>) oFormat)
                    .getRecordWriter(ignored, job, name, progress);
            final JsonHolder nullHolder = keyHolder();
            writer = new RecordWriter<JsonHolder, JsonHolder>() {
                public void close(Reporter reporter) throws IOException {
                    baseWriter.close(reporter);
                }

                public void write(JsonHolder key, JsonHolder value) throws IOException {
                    baseWriter.write(nullHolder, value); // key is unused
                }
            };
        } else {
            final RecordWriter<K, V> baseWriter = ((OutputFormat<K, V>) oFormat).getRecordWriter(ignored, job, name,
                    progress);

            final K baseKey = converter.createKeyTarget();
            final V baseValue = converter.createValueTarget();

            writer = new RecordWriter<JsonHolder, JsonHolder>() {

                public void close(Reporter reporter) throws IOException {
                    baseWriter.close(reporter);
                }

                public void write(JsonHolder key, JsonHolder value) throws IOException {
                    converter.convert(value.value, baseKey, baseValue);
                    baseWriter.write(baseKey, baseValue);
                }
            };
        }
        return writer;
    }

    /*
     * (non-Javadoc)
     * 
     * @see org.apache.hadoop.mapred.OutputFormat#checkOutputSpecs(org.apache.hadoop.fs.FileSystem,
     *      org.apache.hadoop.mapred.JobConf)
     */
    public void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException {
        oFormat.checkOutputSpecs(ignored, job);
    }

    /*
     * (non-Javadoc)
     * 
     * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf)
     */
    public void configure(JobConf conf) {
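        // Called inside a MapReduce task: rebuild this adapter from the args
        // and options that set(JobConf) serialized into the job configuration.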
        Globals.setJobConf(conf);
        if (oFormat == null) {
            try {
                JsonRecord r = ConfUtil.readConf(conf, ConfSetter.CONFOUTOPTIONS_NAME);
                init(r);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
        if (oFormat instanceof JobConfigurable) {
            ((JobConfigurable) oFormat).configure(conf);
        }
    }

    /*
     * (non-Javadoc)
     * 
     * @see com.ibm.jaql.io.hadoop.ConfSetter#setSequential(org.apache.hadoop.mapred.JobConf)
     */
    public void setSequential(JobConf conf) throws Exception {
        set(conf);
        configurator.setSequential(conf);
    }

    /*
     * (non-Javadoc)
     * 
     * @see com.ibm.jaql.io.hadoop.ConfSetter#setParallel(org.apache.hadoop.mapred.JobConf)
     */
    public void setParallel(JobConf conf) throws Exception {
        set(conf);
        configurator.setParallel(conf);
    }

    /**
     * Writes this adapter's state into the given job configuration: registers
     * this class as the output format and serializes the args and options so
     * that configure(JobConf) can rebuild the adapter on the task side.
     * 
     * @param conf the job configuration to populate
     * @throws Exception if the options cannot be written into the conf
     */
    protected void set(JobConf conf) throws Exception {
        conf.setOutputFormat(this.getClass());
        // add options to args
        AdapterStore.getStore().output.replaceOption(args, options);
        // Add any conf overrides from the options into the conf
        ConfUtil.writeConfOptions(conf, options);
        // write out args and options to the conf
        ConfUtil.writeConf(conf, ConfSetter.CONFOUTOPTIONS_NAME, args);
        Globals.setJobConf(conf);
    }

    @Override
    public JsonValue expand() throws Exception {
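        // Expand the logical location into the concrete files behind it: for a
        // directory, list its non-directory children; for a file, report it
        // directly, together with the relevant name node settings.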
        BufferedJsonRecord jr = (BufferedJsonRecord) this.options.getCopy(null);
        BufferedJsonArray ja = new BufferedJsonArray();
        String namenode = null;
        String dfsport = null;
        if (this.location != null) {
            Configuration conf = new Configuration();
            namenode = conf.get("fs.default.name");
            dfsport = conf.get("dfs.namenode.http-address");
            FileSystem fs = FileSystem.get(conf);
            FileStatus files = fs.getFileStatus(new Path(this.location));

            if (files.isDir()) {
                FileStatus[] dirContent = fs.listStatus(new Path(this.location));
                for (FileStatus file : dirContent) {
                    if (!file.isDir()) {
                        ja.add(new JsonString(file.getPath().toString()));
                    }
                }
            } else {
                ja.add(new JsonString(files.getPath().toString()));
            }
        }
        jr.add(Adapter.LOCATION_NAME, ja);
        jr.add(Adapter.TYPE_NAME, args.get(Adapter.TYPE_NAME));
        jr.add(new JsonString("expanded"), JsonBool.make(true));

        if (namenode != null)
            jr.add(new JsonString("fs.default.name"), new JsonString(namenode));
        if (dfsport != null)
            jr.add(new JsonString("dfs.namenode.http-address"), new JsonString(dfsport));

        return jr;
    }
}
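
Usage

A minimal, hypothetical driver (not part of the Jaql sources) showing the call sequence the adapter expects in sequential mode: init with a descriptor record, open, obtain a ClosableJsonWriter, write JSON values, and close. The class name SequentialWriteSketch, the output path, and the assumption that the AdapterStore registry supplies the default output format and configurator options are all illustrative.

package com.ibm.jaql.io.hadoop;

import com.ibm.jaql.io.Adapter;
import com.ibm.jaql.io.ClosableJsonWriter;
import com.ibm.jaql.json.type.BufferedJsonRecord;
import com.ibm.jaql.json.type.JsonString;

public class SequentialWriteSketch {
    public static void main(String[] argv) throws Exception {
        // Hypothetical descriptor record: in Jaql this is normally built by an
        // I/O expression and completed with defaults from the AdapterStore
        // registry, which is assumed here to supply the output format,
        // configurator, and (optional) converter classes for this adapter.
        BufferedJsonRecord args = new BufferedJsonRecord(1);
        args.add(Adapter.LOCATION_NAME, new JsonString("/tmp/example-output"));

        DefaultHadoopOutputAdapter<Object, Object> adapter =
                new DefaultHadoopOutputAdapter<Object, Object>();
        adapter.init(args); // resolve location, options, format, converter
        adapter.open();     // fabricate the sequential job/task context

        ClosableJsonWriter writer = adapter.getWriter();
        writer.write(new JsonString("hello")); // one JSON value per call
        writer.close();

        adapter.close();    // commit the task's output via the committer
    }
}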