cascading.scheme.DeprecatedAvroScheme.java Source code

Introduction

Here is the source code for cascading.scheme.DeprecatedAvroScheme.java
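Before the listing, a minimal usage sketch (not part of the original file) may help orient the reader: it wires the scheme into Hfs taps and copies Avro records from one HDFS path to another. The schema file user.avsc, the paths, and the pipe name are hypothetical, and it assumes cascading-hadoop2-mr1 and avro-mapred are on the classpath.

import java.io.File;
import org.apache.avro.Schema;
import cascading.flow.hadoop2.Hadoop2MR1FlowConnector;
import cascading.pipe.Pipe;
import cascading.scheme.DeprecatedAvroScheme;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;

public class CopyAvro {
    public static void main(String[] args) throws Exception {
        // Parse the Avro schema used by both the source and the sink.
        Schema schema = new Schema.Parser().parse(new File("user.avsc"));

        // Taps over hypothetical HDFS paths, each using the scheme below.
        Tap source = new Hfs(new DeprecatedAvroScheme(schema), "hdfs:///input/users");
        Tap sink = new Hfs(new DeprecatedAvroScheme(schema), "hdfs:///output/users");

        // A pass-through pipe: every tuple read from the source is written to the sink.
        Pipe copy = new Pipe("copy");
        new Hadoop2MR1FlowConnector().connect(source, sink, copy).complete();
    }
}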

Source

/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package cascading.scheme;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroSerialization;
import org.apache.avro.mapred.AvroWrapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;

import cascading.avro.AvroToCascading;
import cascading.avro.CascadingToAvro;
import cascading.avro.serialization.AvroSpecificRecordSerialization;
import cascading.flow.FlowProcess;
import cascading.tap.CompositeTap;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;

/**
 * NOTE: This originated from cascading.avro.AvroScheme; modifications have been made to conform to the
 * Cascading 3.0 API.
 */

public class DeprecatedAvroScheme extends Scheme<Configuration, RecordReader, OutputCollector, Object[], Object[]> {

    private static final String DEFAULT_RECORD_NAME = "CascadingAvroRecord";
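    // Skip paths whose names begin with "_" (e.g. Hadoop's _SUCCESS and _logs
    // markers) when scanning source data for a schema.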
    private static final PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };
    Schema schema;

    /**
     * Constructor to read from an Avro source or write to an Avro sink without specifying the schema. When used as
     * a sink, the sink Fields must carry type information; Map and List types are currently not supported.
     */
    public DeprecatedAvroScheme() {
        this(null);
    }

    /**
     * Create a new Cascading scheme suitable for reading and writing data using the Avro serialization format.
     * This is the legacy constructor form: a Fields object and the corresponding types must be provided.
     *
     * @param fields Fields object from cascading
     * @param types  array of Class types
     */
    public DeprecatedAvroScheme(Fields fields, Class<?>[] types) {
        this(CascadingToAvro.generateAvroSchemaFromFieldsAndTypes(DEFAULT_RECORD_NAME, fields, types));
    }
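    // Example use of the constructor above (hypothetical field names):
    //   new DeprecatedAvroScheme(new Fields("id", "name"),
    //       new Class<?>[] { Long.class, String.class });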

    /**
     * Create a new Cascading scheme suitable for reading and writing data using the Avro serialization format.
     * Note that if schema is null, the Avro schema will be inferred from one of the source files (if this scheme
     * is being used as a source). Schema inference is not currently supported for sinks.
     *
     * @param schema Avro schema, or null if it is to be inferred from a source file. Note that a runtime exception
     *               is thrown if this scheme is used as a sink and no schema is supplied.
     */
    public DeprecatedAvroScheme(Schema schema) {
        this.schema = schema;

        if (schema == null) {
            setSinkFields(Fields.ALL);
            setSourceFields(Fields.UNKNOWN);
        } else {
            Fields cascadingFields = new Fields();
            for (Field avroField : schema.getFields()) {
                cascadingFields = cascadingFields.append(new Fields(avroField.name()));
            }
            setSinkFields(cascadingFields);
            setSourceFields(cascadingFields);
        }
    }
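    // Example use of the constructor above, with a hypothetical record parsed
    // from an inline JSON schema definition:
    //   Schema s = new Schema.Parser().parse(
    //       "{\"type\":\"record\",\"name\":\"User\","
    //           + "\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}");
    //   new DeprecatedAvroScheme(s);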

    /**
     * Helper method to read in a schema when deserializing the object.
     *
     * @param in The ObjectInputStream containing the serialized object
     * @return Schema The parsed schema.
     */
    protected static Schema readSchema(java.io.ObjectInputStream in) throws IOException {
        final Schema.Parser parser = new Schema.Parser();
        try {
            return parser.parse(in.readObject().toString());
        } catch (ClassNotFoundException cce) {
            throw new RuntimeException("Unable to read schema which is expected to be written as a java string",
                    cce);
        }
    }

    /**
     * Return the schema that has been set, as a JSON string.
     *
     * @return String representing the schema, or the empty string if no schema has been set
     */
    String getJsonSchema() {
        if (schema == null) {
            return "";
        } else {
            return schema.toString();
        }
    }

    /**
     * Sink method to take an outgoing tuple and write it to Avro.
     *
     * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
     * @param sinkCall    The cascading SinkCall object. Should be passed in by cascading automatically.
     * @throws IOException
     */
    @Override
    public void sink(FlowProcess<? extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall)
            throws IOException {
        TupleEntry tupleEntry = sinkCall.getOutgoingEntry();

        IndexedRecord record = new Record((Schema) sinkCall.getContext()[0]);
        Object[] objectArray = CascadingToAvro.parseTupleEntry(tupleEntry, (Schema) sinkCall.getContext()[0]);
        for (int i = 0; i < objectArray.length; i++) {
            record.put(i, objectArray[i]);
        }
        //noinspection unchecked
        sinkCall.getOutput().collect(new AvroWrapper<IndexedRecord>(record), NullWritable.get());
    }

    /**
     * Sink prepare method called by cascading once on each reducer. This method stuffs the schema into a context
     * for easy access by the sink method.
     *
     * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
     * @param sinkCall    The cascading SinkCall object. Should be passed in by cascading automatically.
     * @throws IOException
     */
    @Override
    public void sinkPrepare(FlowProcess<? extends Configuration> flowProcess,
            SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
        sinkCall.setContext(new Object[] { schema });
    }

    /**
     * sinkConfInit is called by cascading to set up the sinks. This happens on the client side before the
     * job is distributed.
     * There is a check for the presence of a schema and an exception is thrown if none has been provided.
     * After the schema check the conf object is given the options that Avro needs.
     *
     * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
     * @param tap         The cascading Tap object. Should be passed in by cascading automatically.
     * @param conf        The Hadoop Configuration object. This is passed in by cascading automatically.
     * @throws RuntimeException If no schema is present this halts the entire process.
     */
    @Override
    public void sinkConfInit(FlowProcess<? extends Configuration> flowProcess,
            Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf) {

        if (schema == null) {
            throw new RuntimeException("Must provide sink schema");
        }
        // Set the output schema and output format class
        conf.set(AvroJob.OUTPUT_SCHEMA, schema.toString());
        conf.set("mapred.output.format.class", "org.apache.avro.mapred.AvroOutputFormat");

        // add AvroSerialization to io.serializations
        addAvroSerializations(conf);
    }

    /**
     * This method is called by cascading to set up the incoming fields. If a schema isn't present then it will
     * go and peek at the input data to retrieve one. The field names from the schema are used to name the cascading fields.
     *
     * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
     * @param tap         The cascading Tap object. Should be passed in by cascading automatically.
     * @return Fields The source cascading fields.
     */
    @Override
    public Fields retrieveSourceFields(FlowProcess<? extends Configuration> flowProcess, Tap tap) {
        if (schema == null) {
            try {
                schema = getSourceSchema(flowProcess, tap);
            } catch (IOException e) {
                throw new RuntimeException("Can't get schema from data source", e);
            }
        }
        Fields cascadingFields = new Fields();
        if (schema.getType() == Schema.Type.NULL) {
            cascadingFields = Fields.NONE;
        } else {
            for (Field avroField : schema.getFields())
                cascadingFields = cascadingFields.append(new Fields(avroField.name()));
        }
        setSourceFields(cascadingFields);
        return getSourceFields();
    }

    /**
     * Source method to take an incoming Avro record and make it a Tuple.
     *
     * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
     * @param sourceCall  The cascading SourceCall object. Should be passed in by cascading automatically.
     * @return boolean true on successful parsing and collection, false on failure.
     * @throws IOException
     */
    @Override
    public boolean source(FlowProcess<? extends Configuration> flowProcess,
            SourceCall<Object[], RecordReader> sourceCall) throws IOException {

        @SuppressWarnings("unchecked")
        RecordReader<AvroWrapper<IndexedRecord>, Writable> input = sourceCall.getInput();
        AvroWrapper<IndexedRecord> wrapper = input.createKey();
        if (!input.next(wrapper, input.createValue())) {
            return false;
        }
        IndexedRecord record = wrapper.datum();
        Tuple tuple = sourceCall.getIncomingEntry().getTuple();
        tuple.clear();

        Object[] split = AvroToCascading.parseRecord(record, schema);
        tuple.addAll(split);

        return true;
    }

    /**
     * sourceConfInit is called by cascading to set up the sources. This happens on the client side before the
     * job is distributed.
     * There is a check for the presence of a schema; if none has been provided, the data is peeked at to get one.
     * After the schema check the conf object is given the options that Avro needs.
     *
     * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
     * @param tap         The cascading Tap object. Should be passed in by cascading automatically.
     * @param conf        The Hadoop Configuration object. This is passed in by cascading automatically.
     * @throws RuntimeException If no schema is present this halts the entire process.
     */
    @Override
    public void sourceConfInit(FlowProcess<? extends Configuration> flowProcess,
            Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf) {

        retrieveSourceFields(flowProcess, tap);
        // Set the input schema and input class
        conf.set(AvroJob.INPUT_SCHEMA, schema.toString());
        conf.set("mapred.input.format.class", "org.apache.avro.mapred.AvroInputFormat");

        // add AvroSerialization to io.serializations
        addAvroSerializations(conf);
    }

    /**
     * This method peeks at the source data to get a schema when none has been provided.
     *
     * @param flowProcess The cascading FlowProcess object for this flow.
     * @param tap         The cascading Tap object.
     * @return Schema The schema of the peeked-at data, or a NULL-type schema if no Avro files are found.
     */
    private Schema getSourceSchema(FlowProcess<? extends Configuration> flowProcess, Tap tap) throws IOException {

        if (tap instanceof CompositeTap) {
            tap = (Tap) ((CompositeTap) tap).getChildTaps().next();
        }
        final String path = tap.getIdentifier();
        Path p = new Path(path);
        final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy());
        // Get all the input dirs
        List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(fs.globStatus(p, filter)));
        // Now get all the things that are one level down
        for (FileStatus status : new LinkedList<FileStatus>(statuses)) {
            if (status.isDir())
                for (FileStatus child : Arrays.asList(fs.listStatus(status.getPath(), filter))) {
                    if (child.isDir()) {
                        statuses.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter)));
                    } else if (fs.isFile(child.getPath())) {
                        statuses.add(child);
                    }
                }
        }
        for (FileStatus status : statuses) {
            Path statusPath = status.getPath();
            if (fs.isFile(statusPath)) {
                // read the schema from the first Avro file found; no need to open them all
                InputStream stream = null;
                DataFileStream<Record> reader = null;
                try {
                    stream = new BufferedInputStream(fs.open(statusPath));
                    reader = new DataFileStream<Record>(stream, new GenericDatumReader<Record>());
                    return reader.getSchema();
                } finally {
                    if (reader == null) {
                        if (stream != null) {
                            stream.close();
                        }
                    } else {
                        reader.close();
                    }
                }

            }
        }
        // couldn't find any Avro files, return null schema
        return Schema.create(Schema.Type.NULL);
    }

    private void addAvroSerializations(Configuration conf) {
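        // Register Avro serializers in io.serializations so that Hadoop can
        // (de)serialize Avro-wrapped records in intermediate job data.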
        Collection<String> serializations = conf.getStringCollection("io.serializations");
        if (!serializations.contains(AvroSerialization.class.getName())) {
            serializations.add(AvroSerialization.class.getName());
            serializations.add(AvroSpecificRecordSerialization.class.getName());
        }

        conf.setStrings("io.serializations", serializations.toArray(new String[serializations.size()]));
    }

    // Schema is not Serializable, so it is written out as its JSON string form.
    private void writeObject(java.io.ObjectOutputStream out) throws IOException {
        out.writeObject(this.schema.toString());
    }

    // Counterpart of writeObject: re-parse the schema from its JSON string form.
    private void readObject(java.io.ObjectInputStream in) throws IOException {
        this.schema = readSchema(in);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o)
            return true;
        if (o == null || getClass() != o.getClass())
            return false;
        if (!super.equals(o))
            return false;

        DeprecatedAvroScheme that = (DeprecatedAvroScheme) o;

        if (schema != null ? !schema.equals(that.schema) : that.schema != null)
            return false;

        return true;
    }

    @Override
    public String toString() {
        return "DeprecatedAvroScheme{" + "schema=" + schema + '}';
    }

    @Override
    public int hashCode() {
        return 31 * getSinkFields().hashCode() + (schema == null ? 0 : schema.hashCode());
    }
}