org.apache.pig.piggybank.storage.XMLLoaderBufferedPositionedInputStream.java Source code

Introduction

Here is the source code for org.apache.pig.piggybank.storage.XMLLoaderBufferedPositionedInputStream.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.pig.piggybank.storage;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.io.BufferedPositionedInputStream;
import org.apache.tools.bzip2r.CBZip2InputStream;

/**
 * <code>XMLLoaderBufferedPositionedInputStream</code> is a package-private class that
 * decorates <code>BufferedPositionedInputStream</code>, which in turn decorates
 * BufferedInputStream. It holds the wrapped input stream, which it uses as
 * its basic source of data, possibly reading or providing additional
 * functionality. The class <code>XMLLoaderBufferedPositionedInputStream</code>
 * itself simply overrides the method necessary for reading, i.e.
 * <code>read</code>, with a version that
 * passes all requests to the contained input
 * stream (<code>getPosition</code> is inherited unchanged).
 * Subclasses of <code>XMLLoaderBufferedPositionedInputStream</code>
 * may further override some of these methods
 * and may also provide additional methods
 * and fields.
 * It also provides the additional method <code>collectTag</code>, which returns the byte
 * array between the tags that delimit one xml record, i.e. the bytes of
 * &lt;tag&gt; .*&lt;/tag&gt; are returned.
 *
 * @note we can't use a standard SAX or StAX parser because, for a big xml file,
 *       an intermediate hadoop block may not be valid xml on its own and hence those
 *       parsers may run into problems.
 *
 * @since   pig 2.0
 */

class XMLLoaderBufferedPositionedInputStream extends BufferedPositionedInputStream {

    public final static int S_START = 0;
    public final static int S_MATCH_PREFIX = 1;
    public final static int S_MATCH_TAG = 2;

    /**
     * The input stream to be filtered
     */
    InputStream wrapperIn;

    /**
     * The field to know if the underlying buffer contains any more bytes
     */
    boolean _isReadable;

    /**
    * The maximum number of bytes that this stream instance may read.
    */
    private long maxBytesReadable = 0;

    /**
    * The number of bytes read by this stream so far.
    */
    long bytesRead = 0;

    /**
    * Denotes the end of the current split location
    */
    long end = 0;

    /**
     * Builds a stream that wraps {@code in}. The same stream is handed to the
     * superclass constructor and also kept in {@link #wrapperIn}; the new
     * instance starts out marked as readable.
     *
     * @param in the underlying input stream
     */
    public XMLLoaderBufferedPositionedInputStream(InputStream in) {
        super(in);
        wrapperIn = in;
        // Equivalent to setReadable(true): a fresh stream has data to offer.
        _isReadable = true;
    }

    /**
     * Creates a  split aware <code>XMLLoaderBufferedPositionedInputStream</code>.
     * @param in    the underlying input stream
     * @param start    start location of the split
     * @param end    end location of the split
     */
    public XMLLoaderBufferedPositionedInputStream(InputStream in, long start, long end) {
        this(in);
        this.end = end;
        maxBytesReadable = end - start;
    }

    /**
     * Marks this stream as readable or not readable. The XML scanning
     * methods clear the flag once the underlying stream is exhausted or fails.
     *
     * @param flag {@code true} to mark the stream readable, {@code false} otherwise
     * @see XMLLoaderBufferedPositionedInputStream#isReadable
     */
    private void setReadable(boolean flag) {
        this._isReadable = flag;
    }

    /**
     * Reports whether this stream is still marked as readable. Callers use
     * this to decide whether another XML record may be requested.
     *
     * @return {@code true} if the stream is readable, {@code false} otherwise
     * @see XMLLoaderBufferedPositionedInputStream#setReadable
     */
    public boolean isReadable() {
        return _isReadable;
    }

    /**
     * Reads the next byte of data directly from the wrapped stream
     * {@link #wrapperIn}. The value is returned as an <code>int</code> in the
     * range <code>0</code> to <code>255</code>, or <code>-1</code> if the end
     * of the stream has been reached.
     * <p>
     * This method simply performs <code>wrapperIn.read()</code> and returns
     * the result; it does not delegate to <code>super.read()</code>.
     * NOTE(review): any position bookkeeping done by the superclass in its own
     * <code>read()</code> is therefore presumably bypassed -- confirm against
     * BufferedPositionedInputStream before relying on <code>getPosition()</code>.
     *
     * @return     the next byte of data, or <code>-1</code> if the end of the
     *             stream is reached.
     * @exception  IOException  if an I/O error occurs.
     * @see        XMLLoaderBufferedPositionedInputStream#wrapperIn
     */
    @Override
    public int read() throws IOException {
        return wrapperIn.read();
    }

    /**
     * Collects the bytes from the current position up to and including the
     * tag that closes the element previously matched by
     * XMLLoaderBufferedPositionedInputStream#skipToTag, which must be called
     * first.
     * <p>
     * The scan is done byte by byte and only looks at the characters
     * <code>&lt;</code>, <code>&gt;</code> and <code>/</code>: every opening tag
     * increases the nesting depth, every closing tag (<code>&lt;/x&gt;</code> or
     * <code>.../&gt;</code>) either decreases it or, at depth zero, ends the scan.
     * NOTE(review): anything else starting with <code>&lt;</code> (comments,
     * processing instructions, CDATA) is counted as an opening tag by this
     * logic; well-formed input without those constructs is assumed.
     *
     * @param tagName the end tag to search for (not consulted by the current
     *                implementation, which relies on nesting depth only)
     *
     * @param limit the end pointer for the block for this mapper (not consulted
     *              by the current implementation)
     *
     * @return the collected bytes up to and including the closing tag, or an
     *         empty array if the end of the stream was reached first
     *
     * @throws IOException if reading from the stream fails; the stream is
     *         marked as not readable before the exception is rethrown
     *
     * @see XMLLoaderBufferedPositionedInputStream#skipToTag
     *
     */
    private byte[] collectUntilEndTag(String tagName, long limit) throws IOException {

        ByteArrayOutputStream collectBuf = new ByteArrayOutputStream(1024);
        // Levels of elements we went inside matched node
        int depth = 0;

        //Since skipToTag was called before this function, we know that we are
        //currently inside the matched tag. Assuming the XML file is well
        //structured, we read till we encounter the first close tag. Since
        //the matched element might contain nested element, we keep track of the
        //current depth and terminate only when we encounter a closing tag at
        //level zero

        // A flag to indicate the parsing is currently inside a (start/end) tag
        boolean insideTag = false;
        // A flag to indicate that the current tag is a closing (end) tag
        boolean closingTag = false;

        // Last byte read
        int last_b = -1;
        while (true) {
            int b;
            try {
                b = this.read();
            } catch (IOException e) {
                // Surface the real I/O failure instead of handing back null,
                // which the caller would dereference.
                this.setReadable(false);
                throw e;
            }
            ++bytesRead; // Add one to the bytes read
            if (b == -1) {
                // End of stream inside a record: discard the partial record.
                collectBuf.reset();
                this.setReadable(false);
                break;
            }
            collectBuf.write(b);

            if (b == '<') {
                insideTag = true;
                closingTag = false;
            } else if (b == '>') {
                // Detect the pattern />
                if (last_b == '/') {
                    closingTag = true;
                }
                insideTag = false;
                if (closingTag) {
                    if (depth == 0) {
                        break;
                    }
                    depth--;
                }
            } else if (b == '/' && last_b == '<') {
                // Detected the pattern </
                closingTag = true;
            } else if (insideTag && last_b == '<') {
                // First character after '<' which is not a '/'
                depth++;
            }
            last_b = b;
        }
        return collectBuf.toByteArray();
    }

    /**
     * Skips forward to the next occurrence of the start tag and returns the
     * bytes of the matched tag prefix.
     * The scan is done byte by byte with a small state machine:
     * {@link #S_START} matches <code>&lt;tagName</code>, {@link #S_MATCH_PREFIX}
     * checks that the name is followed by whitespace, <code>/</code> or
     * <code>&gt;</code>, and {@link #S_MATCH_TAG} copies bytes until a
     * <code>&gt;</code> or whitespace is seen. The result is only the beginning
     * of a record; it must be used along with
     * XMLLoaderBufferedPositionedInputStream#collectUntilEndTag
     *
     * @param tagName the start tag to search for
     *
     * @param limit the end pointer for the block for this mapper
     *
     * @return the bytes matched for the start tag, or an empty array when no
     *         start tag was found (split boundary or end of stream reached)
     *
     * @throws IOException if reading from the stream fails; the stream is
     *         marked as not readable before the exception is rethrown
     *
     * @see XMLLoaderBufferedPositionedInputStream#collectUntilEndTag
     *
     */
    private byte[] skipToTag(String tagName, long limit) throws IOException {

        //@todo use the charset and get the charset encoding from the xml encoding.
        byte[] nameBytes = tagName.getBytes();
        byte[] tag = new byte[nameBytes.length + 1];
        tag[0] = (byte) '<';
        System.arraycopy(nameBytes, 0, tag, 1, nameBytes.length);

        ByteArrayOutputStream matchBuf = new ByteArrayOutputStream(512);
        int idxTagChar = 0;
        int state = S_START;

        /*
         * Read till the tag is found in this block. If a partial tag block is found
         * then continue on to the next block.matchBuf contains the data that is currently 
         * matched. If the read has reached the end of split and there are matched data 
         * then continue on to the next block.
         */
        while (splitBoundaryCriteria(wrapperIn) || (matchBuf.size() > 0)) {
            int b = -1;
            try {
                b = this.read();
                ++bytesRead; // Increment the bytes read by 1
                if (b == -1) {
                    state = S_START;
                    matchBuf.reset();
                    this.setReadable(false);
                    break;
                }
                switch (state) {
                case S_START:
                    // start to match the target open tag; read() yields 0..255
                    // while byte is signed, so mask before comparing or tag
                    // names containing bytes >= 0x80 could never match
                    if (b == (tag[idxTagChar] & 0xFF)) {
                        ++idxTagChar;
                        matchBuf.write(b);
                        if (idxTagChar == tag.length) {
                            state = S_MATCH_PREFIX;
                        }
                    } else { // mismatch
                        idxTagChar = 0;
                        matchBuf.reset();
                    }
                    break;
                case S_MATCH_PREFIX:
                    // tag match iff next character is whitespaces or close tag mark
                    if (Character.isWhitespace(b) || b == '/' || b == '>') {
                        matchBuf.write(b);
                        state = S_MATCH_TAG;
                    } else {
                        idxTagChar = 0;
                        matchBuf.reset();
                        state = S_START;
                    }
                    break;
                case S_MATCH_TAG:
                    // keep copy characters until we hit the close tag mark
                    matchBuf.write(b);
                    break;
                default:
                    throw new IllegalArgumentException("Invalid state: " + state);
                }
                if (state == S_MATCH_TAG && (b == '>' || Character.isWhitespace(b))) {
                    break;
                }
                if (state != S_MATCH_TAG && this.getPosition() > limit) {
                    // need to break, no record in this block
                    break;
                }
            } catch (IOException e) {
                // Surface the real I/O failure instead of handing back null,
                // which the caller would dereference.
                this.setReadable(false);
                throw e;
            }
        }
        return matchBuf.toByteArray();
    }

    /**
     * Tells whether reading may continue, i.e. whether the split boundary has
     * not been passed yet.
     * For a <code>CBZip2InputStream</code> the decision is based on the position
     * reported by the stream compared with the split end. For every other
     * stream it is based on the number of bytes read so far compared with
     * the maximum number of readable bytes (for gz input the record reader
     * passes <code>Long.MAX_VALUE</code> as the split end, so the whole file
     * is read).
     *
     * @param wrapperIn2 the stream whose boundary condition is checked
     * @return true while the split boundary has not been passed, false otherwise
     * @throws IOException if the bzip2 stream cannot report its position
     */
    private boolean splitBoundaryCriteria(InputStream wrapperIn2) throws IOException {
        if (!(wrapperIn2 instanceof CBZip2InputStream)) {
            return bytesRead <= maxBytesReadable;
        }
        CBZip2InputStream bzipIn = (CBZip2InputStream) wrapperIn2;
        return bzipIn.getPos() <= end;
    }

    /**
     * Collects one record: the bytes from the start tag to the matching end
     * tag, both inclusive. The start tag is located with <code>skipToTag</code>
     * and the remainder is gathered with <code>collectUntilEndTag</code>; both
     * scan byte by byte.
     * <p>
     * A start tag that is closed inline (<code>&lt;tag/&gt;</code>) is returned
     * as is. A <code>null</code> result from either helper is reported as an
     * <code>IOException</code> rather than surfacing as a
     * <code>NullPointerException</code>.
     * 
     * @param tagName the start tag to search for
     *
     * @param limit the end pointer for the block for this mapper
     *
     * @return the bytes of <code>&lt;tag&gt;.*&lt;/tag&gt;</code>, or an empty
     *         array when no complete record could be collected
     *
     * @throws IOException if the underlying read fails
     *
     * @see XMLLoaderBufferedPositionedInputStream#skipToTag
     *
     * @see XMLLoaderBufferedPositionedInputStream#collectUntilEndTag
     *
     */
    byte[] collectTag(String tagName, long limit) throws IOException {
        byte[] beginTag = skipToTag(tagName, limit);
        if (beginTag == null) {
            throw new IOException("Read failed while searching for start tag <" + tagName + ">");
        }
        final int beginLen = beginTag.length;

        // Check if the tag is closed inline
        if (beginLen > 2 && beginTag[beginLen - 2] == '/' && beginTag[beginLen - 1] == '>') {
            return beginTag;
        }

        // No need to search for the end tag if the start tag is not found
        if (beginLen == 0) {
            return new byte[0];
        }

        byte[] untilTag = collectUntilEndTag(tagName, limit);
        if (untilTag == null) {
            throw new IOException("Read failed while searching for end tag </" + tagName + ">");
        }
        if (untilTag.length == 0) {
            // End of stream before the record was closed: drop the partial record.
            return new byte[0];
        }

        // Concatenate start tag and remainder in one pass.
        byte[] record = new byte[beginLen + untilTag.length];
        System.arraycopy(beginTag, 0, record, 0, beginLen);
        System.arraycopy(untilTag, 0, record, beginLen, untilTag.length);
        return record;
    }

}

/**
 * The load function to load the XML file
 * This implements the LoadFunc interface which is used to parse records
 * from a dataset. The various helper adaptor function is extended from loader.Utf8StorageConverter
 * which included various functions to cast raw byte data into various datatypes. 
 * other sections of the code can call back to the loader to do the cast.
 * This takes a xmlTag as the arg which it will use to split the inputdataset into
 * multiple records. 
 * <code>
 *    
 * For example if the input xml (input.xml) is like this
 *     <configuration>
 *         <property>
 *            <name> foobar </name>
 *            <value> barfoo </value>
 *         </property>
 *         <ignoreProperty>
 *           <name> foo </name>
 *         </ignoreProperty>
 *         <property>
 *            <name> justname </name>
 *         </property>
 *     </configuration>
 *
 *    And your pig script is like this
 *
 *    --load the jar files
 *    register /homes/aloks/pig/udfLib/loader.jar;
 *    -- load the dataset using XMLLoader
 *    -- A is the bag containing the tuple which contains one atom i.e doc see output
 *    A = load '/user/aloks/pig/input.xml' using loader.XMLLoader('property') as (doc:chararray);
 *    --dump the result
 *    dump A;
 *
 *
 *    Then you will get the output
 *
 *    (<property>
 *             <name> foobar </name>
 *             <value> barfoo </value>
 *          </property>)
 *    (<property>
 *             <name> justname </name>
 *          </property>)
 *
 *
 *    Where each () indicate one record 
 *
 * 
 * </code>
 */

public class XMLLoader extends LoadFunc {

    /**
     * logger from pig
     */
    protected final Log mLog = LogFactory.getLog(getClass());

    private XMLFileRecordReader reader = null;

    /**
     * the tuple content which is used while returning
     */
    private ArrayList<Object> mProtoTuple = null;

    /**
     * The tag that separates records. The default value is 'document'
     */
    public String recordIdentifier = "document";

    private String loadLocation;

    /**
     * Creates a loader that keeps the default value of
     * {@link #recordIdentifier} as its record tag.
     */
    public XMLLoader() {
        super();
    }

    /**
     * Creates a Pig loader that pulls records delimited by the given xml tag.
     * For example, with the record identifier <code>document</code> every
     * <code>&lt;document&gt; .* &lt;/document&gt;</code> element is treated as
     * one record.
     *
     * @param recordIdentifier the xml tag which is used to pull records
     */
    public XMLLoader(String recordIdentifier) {
        this();
        // Replace the default tag with the caller's choice.
        this.recordIdentifier = recordIdentifier;
    }

    /**
     * Retrieves the next tuple to be processed.
     * The record reader is first asked whether more input is available; if so,
     * the bytes of the next record are fetched and wrapped in a tuple.
     *
     * @return the next tuple, or null if the reader reports no more input or
     *         the fetched record is empty
     * @throws IOException if reading fails. An interruption is reported as an
     *         IOException after the thread's interrupt flag has been restored;
     *         an IOException from the reader is passed on unchanged; any other
     *         exception raised while building the tuple is wrapped.
     */
    @Override
    public Tuple getNext() throws IOException {

        final boolean hasNext;
        try {
            hasNext = reader.nextKeyValue();
        } catch (InterruptedException e) {
            // Keep the interruption visible to the code further up the stack.
            Thread.currentThread().interrupt();
            throw new IOException(e);
        }

        if (!hasNext) {
            return null;
        }

        try {
            byte[] tagContent = (byte[]) reader.getCurrentValue();
            // No need to create the tuple if there are no contents
            return (tagContent.length > 0) ? createTuple(tagContent) : null;
        } catch (IOException e) {
            // Already the declared type: do not hide it inside another IOException.
            throw e;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new IOException(e);
        } catch (Exception e) {
            throw new IOException(e);
        }
    }

    /**
     * Wraps the bytes of one record in a tuple.
     * The backing list is taken from {@link #mProtoTuple} (created on demand),
     * receives the content as a single <code>DataByteArray</code> when the
     * content is not empty, and is handed to the tuple factory without being
     * copied. The field is cleared afterwards so the next call starts with a
     * fresh list.
     *
     * @param content the raw bytes of the record
     * @return a tuple holding the content, or a tuple without fields if the
     *         content is empty
     * @throws Exception declared for callers; see the invoked Pig classes for
     *         what they may raise
     */
    public Tuple createTuple(byte[] content) throws Exception {
        ArrayList<Object> fields = mProtoTuple;
        if (fields == null) {
            fields = new ArrayList<Object>();
            mProtoTuple = fields;
        }
        if (content.length != 0) {
            fields.add(new DataByteArray(content));
        }
        TupleFactory factory = TupleFactory.getInstance();
        Tuple result = factory.newTupleNoCopy(fields);
        mProtoTuple = null;

        return result;
    }

    /**
     * Checks for equality with an arbitrary object.
     * Only another <code>XMLLoader</code> can be equal; <code>null</code> and
     * objects of other types yield <code>false</code> instead of raising an
     * exception. The comparison itself is delegated to
     * {@link #equals(XMLLoader)}.
     *
     * @param obj the object to compare with, may be null
     * @return true if <code>obj</code> is an equal XMLLoader
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof XMLLoader)) {
            return false;
        }
        return equals((XMLLoader) obj);
    }

    /**
     * Hash code based on the record identifier, the same state the equality
     * check compares, so that equal loaders have equal hash codes.
     *
     * @return the hash code of the record identifier, or 0 if it is null
     */
    @Override
    public int hashCode() {
        return (recordIdentifier == null) ? 0 : recordIdentifier.hashCode();
    }

    /**
     * Checks for equality with another loader. Two loaders are equal when
     * their record identifiers are equal (two null identifiers count as equal).
     *
     * @param other the loader to compare with, may be null
     * @return true if <code>other</code> is not null and has an equal record
     *         identifier
     */
    public boolean equals(XMLLoader other) {
        if (other == null) {
            return false;
        }
        if (this.recordIdentifier == null) {
            return other.recordIdentifier == null;
        }
        return this.recordIdentifier.equals(other.recordIdentifier);
    }

    /**
     * Creates the input format for the configured record tag.
     * When the load location ends with <code>.bz2</code> or <code>.bz</code>
     * the format's splittable flag is switched on.
     *
     * @return a new <code>XMLFileInputFormat</code>
     * @throws IOException declared by the overridden method
     */
    @SuppressWarnings("unchecked")
    @Override
    public InputFormat getInputFormat() throws IOException {
        XMLFileInputFormat format = new XMLFileInputFormat(recordIdentifier);
        boolean bzipLocation = loadLocation.endsWith(".bz2") || loadLocation.endsWith(".bz");
        if (bzipLocation) {
            format.isSplitable = true;
        }
        return format;
    }

    /**
     * Remembers the record reader that subsequent <code>getNext</code> calls
     * pull records from. The reader is expected to be an
     * <code>XMLFileRecordReader</code>; the split is not used.
     *
     * @param reader the record reader to read from
     * @param split  the split being processed (unused)
     * @throws IOException declared by the overridden method
     */
    @SuppressWarnings("unchecked")
    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
        XMLFileRecordReader xmlReader = (XMLFileRecordReader) reader;
        this.reader = xmlReader;
    }

    /**
     * Stores the load location for later use by <code>getInputFormat</code>
     * and registers it as the input path of the job.
     *
     * @param location the location to load from
     * @param job      the job whose input paths are set
     * @throws IOException if the input paths cannot be set
     */
    @Override
    public void setLocation(String location, Job job) throws IOException {
        this.loadLocation = location;
        FileInputFormat.setInputPaths(job, location);
    }

    //------------------------------------------------------------------------
    // Implementation of InputFormat

    /**
     * Input format that hands out <code>XMLFileRecordReader</code> instances
     * for the configured record tag.
     */
    public static class XMLFileInputFormat extends FileInputFormat {

        /**
         * Whether compressed input may be split. The enclosing loader sets it
         * explicitly (for bzip2 locations); it is ignored for files without a
         * compression codec, which are always splittable.
         */
        private boolean isSplitable = false;

        /** The xml tag that delimits one record. */
        private String recordIdentifier;

        /**
         * @param recordIdentifier the xml tag that delimits one record
         */
        public XMLFileInputFormat(String recordIdentifier) {
            this.recordIdentifier = recordIdentifier;
        }

        /**
         * Creates a record reader for the configured record tag. Neither the
         * split nor the context is inspected here.
         */
        @SuppressWarnings("unchecked")
        @Override
        public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            RecordReader xmlReader = new XMLFileRecordReader(recordIdentifier);
            return xmlReader;
        }

        /**
         * A file for which no compression codec is found is always splittable;
         * otherwise the explicitly set flag decides.
         */
        @Override
        protected boolean isSplitable(JobContext context, Path filename) {
            Configuration conf = context.getConfiguration();
            CompressionCodecFactory codecs = new CompressionCodecFactory(conf);
            if (codecs.getCodec(filename) == null) {
                return true;
            }
            return isSplitable;
        }
    }

    //------------------------------------------------------------------------
    // Implementation of RecordReader

    /**
     * Record reader that returns, as the value of each record, the bytes of
     * one xml element delimited by the configured tag. Keys are always null.
     */
    public static class XMLFileRecordReader extends RecordReader {

        /** Start offset of the split. */
        private long start;
        /** End offset of the split; Long.MAX_VALUE for gz input. */
        private long end;
        /** The xml tag that delimits one record. */
        private String recordIdentifier;

        /*
         * xmlloader input stream which has the ability to split the input
         * dataset into records by the specified tag
         */
        private XMLLoaderBufferedPositionedInputStream xmlLoaderBPIS = null;

        /**
         * @param recordIdentifier the xml tag that delimits one record
         */
        public XMLFileRecordReader(String recordIdentifier) {
            this.recordIdentifier = recordIdentifier;
        }

        /**
         * Opens the file of the split, seeks to the split start and wraps the
         * stream according to the file name extension: <code>.bz2</code>/<code>.bz</code>
         * files are read through <code>CBZip2InputStream</code>, <code>.gz</code>
         * files through the codec found for the file, everything else is read
         * as is.
         *
         * @throws IOException if the file cannot be opened or positioned, or if
         *         no compression codec is available for a <code>.gz</code> file;
         *         the opened file stream is closed before the exception leaves
         *         this method
         */
        @Override
        public void initialize(InputSplit genericSplit, TaskAttemptContext context)
                throws IOException, InterruptedException {
            FileSplit split = (FileSplit) genericSplit;
            Configuration job = context.getConfiguration();

            start = split.getStart();
            end = start + split.getLength();
            final Path file = split.getPath();
            final String fileName = file.toString();

            // open the file and seek to the start of the split
            FileSystem fs = file.getFileSystem(job);
            FSDataInputStream fileIn = fs.open(file);

            boolean opened = false;
            try {
                fileIn.seek(start);

                InputStream source;
                if (fileName.endsWith(".bz2") || fileName.endsWith(".bz")) {
                    // For bzip2 files use CBZip2InputStream to read and supply the upper input stream.
                    source = new CBZip2InputStream(fileIn, 9, end);
                } else if (fileName.endsWith(".gz")) {
                    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
                    if (codec == null) {
                        // Fail here with a clear message instead of leaving the
                        // stream unset and failing later with a NullPointerException.
                        throw new IOException("No compression codec available for " + file);
                    }
                    // gz input is read to the end of the file
                    end = Long.MAX_VALUE;
                    source = codec.createInputStream(fileIn);
                } else {
                    source = fileIn;
                }
                this.xmlLoaderBPIS = new XMLLoaderBufferedPositionedInputStream(source, start, end);
                opened = true;
            } finally {
                if (!opened) {
                    // Do not leak the file handle when initialization fails.
                    try {
                        fileIn.close();
                    } catch (IOException ignored) {
                        // the original failure is the one worth reporting
                    }
                }
            }
        }

        /**
         * Closes the underlying stream, if one has been opened.
         */
        @Override
        public void close() throws IOException {
            if (xmlLoaderBPIS != null) {
                xmlLoaderBPIS.close();
            }
        }

        /**
         * @return always null; records carry no key
         */
        @Override
        public Object getCurrentKey() throws IOException, InterruptedException {
            return null;
        }

        /**
         * Scans the stream for the next record and returns its bytes.
         * Note that the scanning happens here, not in <code>nextKeyValue</code>.
         *
         * @return the bytes of the next record as a <code>byte[]</code>; empty
         *         if no further complete record was found
         */
        @Override
        public Object getCurrentValue() throws IOException, InterruptedException {
            return xmlLoaderBPIS.collectTag(recordIdentifier, end);
        }

        /**
         * @return always 0; progress is not tracked by this reader
         */
        @Override
        public float getProgress() throws IOException, InterruptedException {

            return 0;
        }

        /**
         * @return true while the underlying stream is open and still marked as
         *         readable; false before <code>initialize</code> has succeeded
         */
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return xmlLoaderBPIS != null && xmlLoaderBPIS.isReadable();
        }

    }
}