com.ibm.jaql.io.hadoop.DefaultHadoopInputAdapter.java Source code

Introduction

Here is the source code for com.ibm.jaql.io.hadoop.DefaultHadoopInputAdapter.java, the default input adapter that the jaql project uses to read data from Hadoop into jaql.

Source

/*
 * Copyright (C) IBM Corp. 2008.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.ibm.jaql.io.hadoop;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.log4j.Logger;

import com.ibm.jaql.io.AdapterStore;
import com.ibm.jaql.io.ClosableJsonIterator;
import com.ibm.jaql.io.hadoop.converter.KeyValueImport;
import com.ibm.jaql.json.schema.ArraySchema;
import com.ibm.jaql.json.schema.Schema;
import com.ibm.jaql.json.schema.SchemaFactory;
import com.ibm.jaql.json.type.BufferedJsonRecord;
import com.ibm.jaql.json.type.JsonRecord;
import com.ibm.jaql.json.type.JsonString;
import com.ibm.jaql.json.type.JsonValue;
import com.ibm.jaql.lang.util.JaqlUtil;

/**
 * The default adapter for reading data from Hadoop into jaql. It wraps a Hadoop
 * InputFormat, optionally converts each key/value pair to JSON via a
 * KeyValueImport converter, and exposes the result as an iterator over JSON
 * values. A usage sketch follows the source.
 */
public class DefaultHadoopInputAdapter<K, V> implements HadoopInputAdapter {
    static final Logger LOG = Logger.getLogger(DefaultHadoopInputAdapter.class.getName());

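    /**
     * Input option ("ordered"): when it evaluates to true, getSplits() wraps
     * each split so that Hadoop cannot reorder them (see DHIASplit below).
     */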
    public static final JsonString PRESERVE_ORDER_OPTION = new JsonString("ordered");

    protected InputFormat<K, V> iFormat;

    protected InitializableConfSetter configurator;

    protected KeyValueImport<K, V> converter;

    protected JobConf conf;

    protected Reporter reporter;

    protected BufferedJsonRecord args;

    protected String location;

    protected BufferedJsonRecord options;

    public void init(JsonValue args) throws Exception {
        initializeFrom((JsonRecord) args);
    }

    /**
     * @param args
     * @throws Exception
     */
    @SuppressWarnings("unchecked")
    protected void initializeFrom(JsonRecord args) throws Exception {
        this.args = (BufferedJsonRecord) args; // FIXME: shouldn't this be just (JsonRecord)? Who owns it? (ksb)

        // set the location
        this.location = AdapterStore.getStore().getLocation(args);

        // set the options
        this.options = AdapterStore.getStore().input.getOption(args);

        // set the format
        this.iFormat = (InputFormat<K, V>) AdapterStore.getStore().getClassFromRecord(options, FORMAT_NAME, null)
                .newInstance();

        // set the configurator
        Class<?> configuratorClass = AdapterStore.getStore().getClassFromRecord(options, CONFIGURATOR_NAME, null);
        if (configuratorClass != null) {
            this.configurator = (InitializableConfSetter) configuratorClass.newInstance();
            this.configurator.init(args); // FIXME: no need to "new"
        }

        // set the converter
        Class<?> converterClass = AdapterStore.getStore().getClassFromRecord(options, CONVERTER_NAME, null);
        if (converterClass != null) {
            this.converter = (KeyValueImport<K, V>) converterClass.newInstance();
            this.converter.init(options);
        }
    }

    /*
     * (non-Javadoc)
     * 
     * @see com.ibm.jaql.lang.StorableAdapter#open()
     */
    public void open() throws Exception {
        this.conf = new JobConf();
        if (Globals.getJobConf() != null)
            conf.setWorkingDirectory(Globals.getJobConf().getWorkingDirectory());

        this.reporter = Reporter.NULL;

        // write state to conf, pass in top-level args
        setSequential(conf);
        Globals.setJobConf(conf);
        // initialize the format from conf
        if (iFormat instanceof JobConfigurable)
            ((JobConfigurable) iFormat).configure(conf);
    }

    /*
     * (non-Javadoc)
     * 
     * @see com.ibm.jaql.lang.StorableAdapter#close()
     */
    public void close() throws Exception {
        // do nothing
    }

    /*
     * (non-Javadoc)
     * 
     * @see org.apache.hadoop.mapred.InputFormat#getSplits(org.apache.hadoop.mapred.JobConf,
     *      int)
     */
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
        InputSplit[] inSplits = iFormat.getSplits(job, numSplits);
        if (JaqlUtil.ebv(this.options.get(PRESERVE_ORDER_OPTION))) {
            // TODO: only do if order preserving?  Need to fix fileSplitToRecord
            InputSplit[] splits = new InputSplit[inSplits.length];
            for (int i = 0; i < splits.length; i++) {
                splits[i] = new DHIASplit(inSplits[i], i);
            }
            return splits;
        }
        return inSplits;
    }

    /**
     * A wrapper for the real input split. At this point, the wrapper is used
     * solely to prevent JobClient.writeNewSplits() from reordering our splits,
     * since such reordering makes map jobs non-order-preserving.
     */
    public static class DHIASplit implements InputSplit {
        protected InputSplit split;
        protected int index;

        public DHIASplit() {
        }

        public DHIASplit(InputSplit split, int index) {
            this.split = split;
            this.index = index;
        }

        @Override
        public long getLength() throws IOException {
            // We lie to Hadoop so that it doesn't reorder our tasks by size; we
            // would rather have order preserved.
            // TODO: open two bugs against hadoop:
            //   1. JobClient.writeNewSplits should NOT reorder splits. Instead the job scheduler
            //      could schedule tasks by size if it so chooses.
            //   2. RawLocalFileSystem.listStatus should sort files returned by File.list() by name
            //      to preserve order.
            return Integer.MAX_VALUE - index;
        }

        @Override
        public String[] getLocations() throws IOException {
            return split.getLocations();
        }

        @Override
        public void readFields(DataInput in) throws IOException {
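            // Mirror of write(): read the wrapped split's class name, then the
            // split index, then the wrapped split's own fields.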
            String cname = in.readUTF();
            try {
                Class<? extends InputSplit> cls = Class.forName(cname).asSubclass(InputSplit.class);
                split = (InputSplit) ReflectionUtils.newInstance(cls, null);
            } catch (Exception e) {
                throw new IOException("Failed split init", e);
            }
            index = in.readInt();
            split.readFields(in);
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeUTF(split.getClass().getName());
            out.writeInt(index);
            split.write(out);
        }
    }

    /**
     * Tracks the iterator's position across splits: the index of the next split
     * to open and the record reader for the split currently being read.
     */
    private class SplitState {
        int splitId = 0;

        RecordReader<JsonHolder, JsonHolder> reader = null;
    }

    /*
     * (non-Javadoc)
     * 
     * @see com.ibm.jaql.lang.InputAdapter#iter()
     */
    public ClosableJsonIterator iter() throws IOException {
        final SplitState state = new SplitState();
        final InputSplit[] splits = getSplits(conf, conf.getNumMapTasks());

        final JsonHolder valueHolder = valueHolder();
        valueHolder.value = converter != null ? converter.createTarget() : null;
        return new ClosableJsonIterator() {
            JsonHolder key;

            /*
             * (non-Javadoc)
             * 
             * @see com.ibm.jaql.io.ItemReader#close()
             */
            @Override
            public void close() throws IOException {
                if (state.reader != null) {
                    state.reader.close();
                }
            }

            /*
             * (non-Javadoc)
             * 
             * @see com.ibm.jaql.io.ItemReader#next(com.ibm.jaql.json.type.Item)
             */
            public boolean moveNext() throws IOException {
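                // Loop across splits: open the next split's reader when none is
                // active, and return false once every split has been exhausted.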
                while (true) {
                    if (state.reader == null) {
                        if (state.splitId >= splits.length) {
                            close();
                            return false;
                        }
                        InputSplit split = splits[state.splitId++];
                        state.reader = getRecordReader(split, conf, reporter);
                        if (key == null)
                            key = state.reader.createKey();
                    }
                    if (state.reader.next(key, valueHolder)) {
                        currentValue = valueHolder.value;
                        return true;
                    }
                    state.reader.close();
                    state.reader = null;
                }
            }
        };
    }

    protected JsonHolder keyHolder() {
        return new JsonHolderDefault();
    }

    protected JsonHolder valueHolder() {
        return new JsonHolderDefault();
    }
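
    // Note: keyHolder() and valueHolder() are protected extension points;
    // subclasses can override them to supply specialized JsonHolder implementations.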

    /*
     * (non-Javadoc)
     * 
     * @see org.apache.hadoop.mapred.InputFormat#getRecordReader(org.apache.hadoop.mapred.InputSplit,
     *      org.apache.hadoop.mapred.JobConf, org.apache.hadoop.mapred.Reporter)
     */
    @SuppressWarnings("unchecked")
    public RecordReader<JsonHolder, JsonHolder> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
            throws IOException {
        if (split instanceof DHIASplit) {
            // unwrap the order-preserving wrapper; the underlying input format
            // only understands its own split class
            split = ((DHIASplit) split).split;
        }

        if (converter == null)
            return ((InputFormat<JsonHolder, JsonHolder>) iFormat).getRecordReader(split, job, reporter);
        final RecordReader<K, V> baseReader = ((InputFormat<K, V>) iFormat).getRecordReader(split, job, reporter);
        final K baseKey = baseReader.createKey();
        final V baseValue = baseReader.createValue();

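        // Wrap the base reader so that each (key, value) pair produced by the
        // underlying format is converted into a single JSON value.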
        return new RecordReader<JsonHolder, JsonHolder>() {

            public void close() throws IOException {
                baseReader.close();
            }

            public JsonHolder createKey() {
                return keyHolder();
            }

            public JsonHolder createValue() {
                JsonHolder holder = valueHolder();
                holder.value = converter.createTarget();
                return holder;
            }

            public long getPos() throws IOException {
                return baseReader.getPos();
            }

            public float getProgress() throws IOException {
                return baseReader.getProgress();
            }

            public boolean next(JsonHolder key, JsonHolder value) throws IOException {
                boolean hasMore = baseReader.next(baseKey, baseValue);
                if (!hasMore)
                    return false;
                value.value = converter.convert(baseKey, baseValue, value.value);
                return true;
            }
        };
    }

    /*
     * (non-Javadoc)
     * 
     * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf)
     */
    public void configure(JobConf conf) {
        Globals.setJobConf(conf);
        // TODO: factor this configuration code so that it can be shared with the
        // composite input format...
        // setup the internal input format
        if (iFormat == null) {
            try {
                JsonRecord options = ConfUtil.readConf(conf, ConfSetter.CONFINOPTIONS_NAME);
                initializeFrom(options);
            } catch (Exception e) {
                throw new RuntimeException("Could not initialize InputAdapter from JobConf", e);
            }
        }
        if (iFormat instanceof JobConfigurable) {
            ((JobConfigurable) iFormat).configure(conf);
        }
    }

    /*
     * (non-Javadoc)
     * 
     * @see com.ibm.jaql.io.hadoop.ConfSetter#setSequential(org.apache.hadoop.mapred.JobConf)
     */
    public void setSequential(JobConf conf) throws Exception {
        set(conf);

        // write the optional args for the configurator
        if (configurator != null) {
            configurator.setSequential(conf); // TODO: double-check what options the configurator needs
        }
    }

    /*
     * (non-Javadoc)
     * 
     * @see com.ibm.jaql.io.hadoop.ConfSetter#setParallel(org.apache.hadoop.mapred.JobConf)
     */
    public void setParallel(JobConf conf) throws Exception {
        set(conf);

        if (configurator != null) {
            // write the optional args for the configurator
            configurator.setParallel(conf); // TODO: double-check what options the configurator needs
        }
    }

    /**
     * @param conf
     * @throws Exception
     */
    protected void set(JobConf conf) throws Exception {
        // TODO: refactor so that its shared with composite inputformat...
        // set the input format to this adapter
        conf.setInputFormat(this.getClass());

        // replace the options record in args with the options override
        AdapterStore.getStore().input.replaceOption(args, options);

        // Add any conf overrides from the options into the conf
        ConfUtil.writeConfOptions(conf, options);

        // write the top-level args for the adapter
        ConfUtil.writeConf(conf, ConfSetter.CONFINOPTIONS_NAME, args);

        // Set the global options
        Globals.setJobConf(conf);
    }

    @Override
    public Schema getSchema() {
        if (converter != null) {
            // a converter is in use, so the element schema comes from the converter
            return new ArraySchema(null, converter.getSchema());
        } else {
            // no converter: the input format already produces JSON values
            return SchemaFactory.arraySchema();
        }
    }

}
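
Example usage

The adapter is normally driven by jaql's I/O layer rather than called directly, but the sketch below shows the call sequence it supports. This is an illustrative sketch, not code from the jaql sources: the descriptor record passed to init() must be laid out however your AdapterStore registration expects, and the iterator's current() accessor is assumed from the way moveNext() sets currentValue in the source above.

import com.ibm.jaql.io.ClosableJsonIterator;
import com.ibm.jaql.io.hadoop.DefaultHadoopInputAdapter;
import com.ibm.jaql.json.type.BufferedJsonRecord;
import com.ibm.jaql.json.type.JsonValue;

public class AdapterSketch {
    // argsRecord is a descriptor you build yourself. initializeFrom() pulls the
    // location, the options record, and the format/converter/configurator class
    // names out of it via AdapterStore, so it must follow whatever layout your
    // AdapterStore registration defines.
    public static void readAll(BufferedJsonRecord argsRecord) throws Exception {
        DefaultHadoopInputAdapter<Object, Object> adapter =
                new DefaultHadoopInputAdapter<Object, Object>();
        adapter.init(argsRecord); // resolves format, converter, and configurator
        adapter.open();           // builds the JobConf and configures the input format
        ClosableJsonIterator it = adapter.iter();
        try {
            while (it.moveNext()) {
                // assumption: current() returns the value that moveNext() stored
                JsonValue v = it.current();
                // ... process v ...
            }
        } finally {
            it.close();
            adapter.close();
        }
    }
}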