org.apache.hadoop.zebra.io.ColumnGroup.java Source code

Introduction

Here is the source code for org.apache.hadoop.zebra.io.ColumnGroup.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package org.apache.hadoop.zebra.io;

import java.io.Closeable;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.*;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.zebra.tfile.TFile;
import org.apache.hadoop.zebra.tfile.Utils;
import org.apache.hadoop.zebra.tfile.ByteArray;
import org.apache.hadoop.zebra.tfile.RawComparable;
import org.apache.hadoop.zebra.types.CGSchema;
import org.apache.hadoop.zebra.parser.ParseException;
import org.apache.hadoop.zebra.types.Partition;
import org.apache.hadoop.zebra.types.Projection;
import org.apache.hadoop.zebra.schema.Schema;
import org.apache.hadoop.zebra.types.TypesUtils;
import org.apache.hadoop.zebra.types.TypesUtils.TupleReader;
import org.apache.hadoop.zebra.types.TypesUtils.TupleWriter;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;

/**
 * ColumnGroup is the basic unit of a persistent table. The following
 * Configuration parameters can customize the behavior of ColumnGroup.
 * <ul>
 * <li><b>table.tfile.minblock.size</b> (int) Minimum compression block size
 * for the underlying TFile (defaults to 1024*1024).
 * <li><b>table.output.tfile.compression</b> (String) Compression method (one
 * of "none", "lzo", "gz") (defaults to "gz").
 * <li><b>table.input.split.minSize</b> (int) Minimum split size (defaults to
 * 64*1024).
 * </ul>
 * 
 * @see TFile#getSupportedCompressionAlgorithms()
 */
class ColumnGroup {
    static Log LOG = LogFactory.getLog(ColumnGroup.class);

    private final static String CONF_COMPRESS = "table.output.tfile.compression";
    private final static String DEFAULT_COMPRESS = "gz";
    private final static String CONF_MIN_BLOCK_SIZE = "table.tfile.minblock.size";
    private final static int DEFAULT_MIN_BLOCK_SIZE = 1024 * 1024;

    private final static String CONF_MIN_SPLIT_SIZE = "table.input.split.minSize";
    private final static int DEFAULT_MIN_SPLIT_SIZE = 64 * 1024;

    static final double SPLIT_SLOP = 1.1; // 10% slop

    // exclude files that start with the following prefix; may change to a regex
    private final static String CONF_NON_DATAFILE_PREFIX = "table.cg.nondatafile.prefix";
    private final static String SPECIAL_FILE_PREFIX = ".";

    // temporary schema file name, used as a flag for an unfinished CG
    private final static String SCHEMA_FILE = ".schema";
    // metadata TFile for the entire CG, used as a flag for a closed CG
    final static String META_FILE = ".meta";

    // sorted table key ranges for default sorted-table split generation
    private final static String KEY_RANGE_FOR_DEFAULT_SORTED_SPLIT = ".keyrange";

    static final String BLOCK_NAME_INDEX = "ColumnGroup.index";

    static Path makeMetaFilePath(Path parent) {
        return new Path(parent, META_FILE);
    }

    static String getCompression(Configuration conf) {
        return conf.get(CONF_COMPRESS, DEFAULT_COMPRESS);
    }

    static int getMinBlockSize(Configuration conf) {
        return conf.getInt(CONF_MIN_BLOCK_SIZE, DEFAULT_MIN_BLOCK_SIZE);
    }

    static String getNonDataFilePrefix(Configuration conf) {
        return conf.get(CONF_NON_DATAFILE_PREFIX, SPECIAL_FILE_PREFIX);
    }

    static int getMinSplitSize(Configuration conf) {
        return conf.getInt(CONF_MIN_SPLIT_SIZE, DEFAULT_MIN_SPLIT_SIZE);
    }
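
    /*
     * Illustrative sketch added for this tutorial, not part of the original
     * source: how a job might override the knobs above before writing a
     * column group. The chosen values are hypothetical.
     */
    static Configuration exampleOutputConf() {
        Configuration conf = new Configuration();
        conf.set(CONF_COMPRESS, "gz"); // one of "none", "lzo", "gz"
        conf.setInt(CONF_MIN_BLOCK_SIZE, 1024 * 1024); // 1MB compression blocks
        conf.setInt(CONF_MIN_SPLIT_SIZE, 64 * 1024); // 64KB minimum splits
        return conf;
    }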

    /**
     * Drop a column group: delete all files belonging to this column group
     * from the file system.
     * 
     * @param path
     *          the path to the ColumnGroup.
     * @param conf
     *          The configuration object.
     */
    public static void drop(Path path, Configuration conf) throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        fs.delete(path, true);
        // TODO:
        // fs.close();
    }
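
    /*
     * Illustrative sketch (tutorial-only, hypothetical path): dropping a
     * column group is simply a recursive delete of its directory.
     */
    static void exampleDrop(Configuration conf) throws IOException {
        drop(new Path("/user/demo/table/cg0"), conf);
    }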

    /**
     * Scan the file system, looking for TFiles, and build an in-memory index of a
     * column group.
     * 
     * @param fs
     *          The file system
     * @param path
     *          The base path of the column group.
     * @param dirty
     *          Whether to build a dirty index. A dirty index is built from
     *          file-level status alone, without opening the individual TFiles.
     *          The flag may only be set for unsorted ColumnGroups.
     * @param conf
     *          The configuration object.
     * @return The in-memory index object.
     * @throws IOException
     */
    static CGIndex buildIndex(FileSystem fs, Path path, boolean dirty, Configuration conf) throws IOException {
        CGIndex ret = new CGIndex();
        CGPathFilter cgPathFilter = new CGPathFilter();
        CGPathFilter.setConf(conf);
        FileStatus[] files = fs.listStatus(path, cgPathFilter);

        Comparator<RawComparable> comparator = null;
        for (FileStatus f : files) {
            if (dirty) {
                ret.add(f.getLen(), f.getPath().getName());
            } else {
                FSDataInputStream dis = null;
                TFile.Reader tr = null;
                try {
                    dis = fs.open(f.getPath());
                    tr = new TFile.Reader(dis, f.getLen(), conf);
                    if (comparator == null) {
                        comparator = tr.getComparator();
                    }
                    if (tr.getEntryCount() > 0) {
                        CGIndexEntry range = new CGIndexEntry(f.getPath().getName(), tr.getEntryCount(),
                                tr.getFirstKey(), tr.getLastKey());
                        ret.add(f.getLen(), tr.getEntryCount(), range);
                    }
                } catch (IOException e) {
                    // Log the error and skip the unreadable TFile.
                    LOG.warn("Skipping bad TFile " + f.getPath() + ": " + e.getMessage(), e);
                } finally {
                    if (tr != null) {
                        tr.close();
                    }
                    if (dis != null) {
                        dis.close();
                    }
                }
            }
        }

        ret.sort(comparator);

        int idx = 0;
        for (CGIndexEntry e : ret.getIndex()) {
            e.setIndex(idx++);
        }

        return ret;
    }
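
    /*
     * Illustrative sketch (tutorial-only, hypothetical path): a "dirty" index
     * is built from file names and sizes alone, while a full index opens every
     * TFile to record its key range; the dirty form is only valid for unsorted
     * column groups.
     */
    static CGIndex exampleIndex(Configuration conf, boolean sorted) throws IOException {
        Path cgPath = new Path("/user/demo/table/cg0");
        FileSystem fs = cgPath.getFileSystem(conf);
        return buildIndex(fs, cgPath, /* dirty = */ !sorted, conf);
    }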

    /**
     * ColumnGroup reader.
     */
    public static class Reader implements Closeable {
        Path path;
        Configuration conf;
        FileSystem fs;
        CGSchema cgschema;
        Comparator<RawComparable> comparator;
        Projection projection;
        CGIndex cgindex;
        ArrayList<SplitColumn> exec;
        SplitColumn top; // directly associated with logical schema
        SplitColumn leaf; // corresponding to projection
        boolean closed;
        boolean dirty;

        /**
         * Get the Column Group physical schema without loading the full CG index.
         * 
         * @param path
         *          The path to the ColumnGroup.
         * @param conf
         *          The configuration object.
         * @return The ColumnGroup schema.
         * @throws IOException
         */
        public static Schema getSchema(Path path, Configuration conf) throws IOException, ParseException {
            FileSystem fs = path.getFileSystem(conf);
            CGSchema cgschema = CGSchema.load(fs, path);
            return cgschema.getSchema();
        }
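
        /*
         * Illustrative sketch (tutorial-only, hypothetical path): peek at a
         * column group's physical schema without constructing a full Reader.
         */
        static Schema exampleGetSchema(Configuration conf) throws IOException, ParseException {
            return getSchema(new Path("/user/demo/table/cg0"), conf);
        }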

        /**
         * Create a ColumnGroup reader.
         * 
         * @param path
         *          The directory path to the column group.
         * @param conf
         *          Optional configuration parameters.
         * @throws IOException
         */
        public Reader(Path path, Configuration conf) throws IOException, ParseException {
            this(path, conf, false);
        }

        public Reader(Path path, Configuration conf, boolean mapper) throws IOException, ParseException {
            this(path, true, conf, mapper);
        }

        Reader(Path path, boolean dirty, Configuration conf) throws IOException, ParseException {
            this(path, dirty, conf, false);
        }

        Reader(Path path, boolean dirty, Configuration conf, boolean mapper) throws IOException, ParseException {
            this.path = path;
            this.conf = conf;
            this.dirty = dirty;

            fs = path.getFileSystem(conf);
            // check existence of path
            if (!fs.exists(path)) {
                throw new IOException("Path doesn't exist: " + path);
            }

            if (!mapper && !fs.getFileStatus(path).isDir()) {
                throw new IOException("Path exists but not a directory: " + path);
            }

            cgschema = CGSchema.load(fs, path);
            if (cgschema.isSorted()) {
                comparator = TFile.makeComparator(cgschema.getComparator());
            }
            projection = new Projection(cgschema.getSchema()); // default projection to CG schema.
            Path metaFilePath = makeMetaFilePath(path);
            /* the meta file must exist for a fully-written CG */
            if (!fs.exists(metaFilePath)) {
                throw new FileNotFoundException("Missing meta file: " + metaFilePath);
            } else if (cgschema.isSorted()) {
                MetaFile.Reader metaFile = MetaFile.createReader(metaFilePath, conf);
                try {
                    cgindex = new CGIndex();
                    DataInputStream dis = metaFile.getMetaBlock(BLOCK_NAME_INDEX);
                    try {
                        cgindex.readFields(dis);
                    } catch (IOException e) {
                        throw new IOException("Index file read failure :" + e.getMessage());
                    } finally {
                        dis.close();
                    }
                } finally {
                    metaFile.close();
                }
            }
        }

        /**
         * Set the projection for the reader. This will affect calls to
         * getScanner(), getStatus(), and getColumnNames().
         * 
         * @param projection
         *          The projection on the column group for subsequent read
         *          operations. To select all columns, pass null.
         */
        public synchronized void setProjection(String projection) throws ParseException {
            if (projection == null) {
                this.projection = new Projection(cgschema.getSchema());
            } else {
                this.projection = new Projection(cgschema.getSchema(), projection);
            }
        }
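
        /*
         * Illustrative sketch (tutorial-only): a projection is a
         * comma-separated list of column names from the CG schema; "c1, c2"
         * here is hypothetical. Passing null restores the full schema.
         */
        void exampleProjection() throws ParseException {
            setProjection("c1, c2"); // subsequent scanners return only c1 and c2
            setProjection(null); // back to all columns
        }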

        /**
         * Get the schema of columns of the table (possibly through projection).
         * 
         * @return Schema of the columns of the table (possibly through projection).
         */
        public Schema getSchema() throws ParseException {
            return projection.getSchema();
        }

        /**
         * Get the projection
         * @return Projection of this Reader
         */
        public Projection getProjection() {
            return projection;
        }

        public String getName() {
            return cgschema.getName();
        }

        public String getSerializer() {
            return cgschema.getSerializer();
        }

        public String getCompressor() {
            return cgschema.getCompressor();
        }

        public CGSchema getCGSchema() {
            return cgschema;
        }

        public String getGroup() {
            return cgschema.getGroup();
        }

        public short getPerm() {
            return cgschema.getPerm();
        }

        /**
         * Get a scanner that reads all rows whose row keys fall in a specific
         * range.
         * 
         * @param beginKey
         *          The begin key of the scan range.
         * @param endKey
         *          The end key of the scan range.
         * @param closeReader
         *          close the underlying Reader object when the scanner is
         *          closed. Should be set to true if this is the only scanner
         *          on top of the reader, so that its resources are released
         *          once the scanner is closed.
         * @return A scanner object.
         * @throws IOException
         */
        public synchronized CGScanner getScanner(BytesWritable beginKey, BytesWritable endKey, boolean closeReader)
                throws IOException, ParseException {
            if (closed) {
                throw new EOFException("Reader already closed");
            }
            if (!isSorted()) {
                throw new IOException("Cannot get key-bounded scanner for unsorted table");
            }
            RawComparable begin = (beginKey != null) ? new ByteArray(beginKey.getBytes(), 0, beginKey.getLength())
                    : null;
            RawComparable end = (endKey != null) ? new ByteArray(endKey.getBytes(), 0, endKey.getLength()) : null;
            if (begin != null && end != null) {
                if (comparator.compare(begin, end) >= 0) {
                    throw new IOException("Zero-key-range split");
                }
            }

            return new CGScanner(begin, end, closeReader);
        }
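
        /*
         * Illustrative sketch (tutorial-only, hypothetical keys): scan all
         * rows with keys in ["begin", "end"). closeReader is true because this
         * is the only scanner on top of the reader.
         */
        void exampleKeyRangeScan() throws IOException, ParseException {
            BytesWritable beginKey = new BytesWritable("begin".getBytes());
            BytesWritable endKey = new BytesWritable("end".getBytes());
            CGScanner scanner = getScanner(beginKey, endKey, true);
            try {
                BytesWritable key = new BytesWritable();
                Tuple row = TypesUtils.createTuple(getSchema());
                while (!scanner.atEnd()) {
                    scanner.getKey(key);
                    scanner.getValue(row);
                    scanner.advance();
                }
            } finally {
                scanner.close();
            }
        }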

        /**
         * Get a scanner that reads a consecutive range of rows as defined by
         * the CGRangeSplit object, which should be obtained from a previous
         * call to rangeSplit().
         * 
         * @param split
         *          The split range. If null, get a scanner to read the complete
         *          column group.
         * @param closeReader
         *          close the underlying Reader object when the scanner is
         *          closed. Should be set to true if this is the only scanner
         *          on top of the reader, so that its resources are released
         *          once the scanner is closed.
         * @return A scanner object.
         * @throws IOException
         */
        public synchronized CGScanner getScanner(CGRangeSplit split, boolean closeReader)
                throws IOException, ParseException {
            if (closed) {
                throw new EOFException("Reader already closed");
            }

            if (split == null) {
                if (cgindex == null)
                    cgindex = buildIndex(fs, path, dirty, conf);
                return getScanner(new CGRangeSplit(0, cgindex.size()), closeReader);
            }
            if (split.len < 0) {
                throw new IllegalArgumentException("Illegal range split");
            }

            return new CGScanner(split, closeReader);
        }

        /**
         * Get a scanner that reads the rows defined by rowRange. 
         * 
         * @param closeReader
         *          close the underlying Reader object when the scanner is
         *          closed. Should be set to true if this is the only scanner
         *          on top of the reader, so that its resources are released
         *          once the scanner is closed.
         * @param rowSplit specifies part index, start row, and end row.
         * @return A scanner object.
         */
        public synchronized CGScanner getScanner(boolean closeReader, CGRowSplit rowSplit)
                throws IOException, ParseException {
            if (closed) {
                throw new EOFException("Reader already closed");
            }

            return new CGScanner(rowSplit, closeReader);
        }

        /**
         * Given a split range, calculate how the file data that fall into the range
         * are distributed among hosts.
         * 
         * @param split
         *          The range-based split. If null, return all blocks.
         * @return a map from host name to the amount of data (in bytes) the host
         *         owns that fall roughly into the key range.
         */
        public BlockDistribution getBlockDistribution(CGRangeSplit split) throws IOException {
            if (cgindex == null)
                cgindex = buildIndex(fs, path, dirty, conf);
            if (split == null) {
                return getBlockDistribution(new CGRangeSplit(0, cgindex.size()));
            }

            // any negative operand makes the bitwise OR negative
            if ((split.start | split.len | (cgindex.size() - split.start - split.len)) < 0) {
                throw new IndexOutOfBoundsException("Bad split");
            }

            BlockDistribution ret = new BlockDistribution();
            for (int i = split.start; i < split.start + split.len; ++i) {
                CGIndexEntry dfkr = cgindex.get(i);
                Path tfilePath = new Path(path, dfkr.getName());
                FileStatus tfileStatus = fs.getFileStatus(tfilePath);
                BlockLocation[] locations = fs.getFileBlockLocations(tfileStatus, 0, tfileStatus.getLen());
                for (BlockLocation l : locations) {
                    ret.add(l);
                }
            }

            return ret;
        }
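
        /*
         * Illustrative sketch (tutorial-only): the cast picks the range-based
         * overload, and a null split expands to cover every file, so this
         * returns the per-host byte ownership of the whole column group.
         */
        BlockDistribution exampleWholeCGDistribution() throws IOException {
            return getBlockDistribution((CGRangeSplit) null);
        }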

        /**
         * Given a row range, calculate how the file data that fall into the range
         * are distributed among hosts.
         * 
         * @param split
         *          The row-based split. If null, return all blocks.
         * @return a map from host name to the amount of data (in bytes) the host
         *         owns that fall roughly into the key range.
         */
        public BlockDistribution getBlockDistribution(CGRowSplit split) throws IOException {
            if (split == null) {
                throw new IOException("Row-based split cannot be null for getBlockDistribution()");
            }

            BlockDistribution ret = new BlockDistribution();
            for (int i = 0; i < split.length; i++) {
                FileStatus tfileStatus = fs.getFileStatus(new Path(path, split.names[i]));

                BlockLocation[] locations = null;
                long len = 0;
                if (i == 0) {
                    if (split.startByteFirst != -1) {
                        len = split.numBytesFirst;
                        locations = fs.getFileBlockLocations(tfileStatus, split.startByteFirst, len);
                    }
                } else if (i == split.length - 1) {
                    if (split.numBytesLast != -1) {
                        len = split.numBytesLast;
                        locations = fs.getFileBlockLocations(tfileStatus, 0, len);
                    }
                }

                if (locations == null) {
                    len = tfileStatus.getLen();
                    locations = fs.getFileBlockLocations(tfileStatus, 0, len);
                }

                for (BlockLocation l : locations) {
                    ret.add(l);
                }
            }
            return ret;
        }

        /** Returns the index of the block containing the given offset. */
        private int getStartBlockIndex(long[] startOffsets, long offset) {
            int index = Arrays.binarySearch(startOffsets, offset);
            if (index < 0)
                index = -index - 2; // block whose start is the largest one <= offset
            return index;
        }

        /** Returns the index of the first block whose start offset is >= the given offset. */
        private int getEndBlockIndex(long[] startOffsets, long offset) {
            int index = Arrays.binarySearch(startOffsets, offset);
            if (index < 0)
                index = -index - 1; // insertion point: first start offset > offset
            return index;
        }

        /**
         * Sets the row range fields (startRowFirst, numRowsFirst, numRowsLast)
         * in rowSplit based on the byte offsets and lengths copied from src.
         * 
         * The row-range fields in rowSplit itself are assumed to be invalid on
         * entry.
         */
        void fillRowSplit(CGRowSplit rowSplit, CGRowSplit src) throws IOException {

            if (src.names == null || src.length == 0)
                return;

            boolean noSizeInIndex = false;
            long[] sizes = rowSplit.sizes;
            if (sizes == null) {
                /* The on-disk table is sorted, so no file sizes are present in the index.
                 * This fallback will become unnecessary once CGIndexEntry serializes its
                 * bytes field and meta file versioning is supported.
                 */
                noSizeInIndex = true;
            }
            rowSplit.names = src.names;
            rowSplit.length = src.length;
            rowSplit.startByteFirst = src.startByteFirst;
            rowSplit.numBytesFirst = src.numBytesFirst;
            rowSplit.numBytesLast = src.numBytesLast;

            Path firstPath = null, lastPath;
            TFile.Reader reader = null;

            if (src.startByteFirst != -1) {
                firstPath = new Path(path, rowSplit.names[0]);
                long size;
                if (noSizeInIndex) {
                    FileStatus tfile = fs.getFileStatus(firstPath);
                    size = tfile.getLen();
                } else
                    size = sizes[0];
                reader = new TFile.Reader(fs.open(firstPath), size, conf);
                try {
                    long startRow = reader.getRecordNumNear(src.startByteFirst);
                    long endRow = reader.getRecordNumNear(src.startByteFirst + src.numBytesFirst);

                    if (endRow < startRow)
                        endRow = startRow;
                    rowSplit.startRowFirst = startRow;
                    rowSplit.numRowsFirst = endRow - startRow;
                } catch (IOException e) {
                    reader.close();
                    throw e;
                }
            }
            if (src.numBytesLast != -1 && rowSplit.length > 1) {
                lastPath = new Path(path, rowSplit.names[rowSplit.length - 1]);
                if (reader == null || !firstPath.equals(lastPath)) {
                    if (reader != null)
                        reader.close();
                    long size;
                    if (noSizeInIndex) {
                        FileStatus tfile = fs.getFileStatus(lastPath);
                        size = tfile.getLen();
                    } else
                        size = sizes[rowSplit.length - 1];
                    reader = new TFile.Reader(fs.open(lastPath), size, conf);
                }
                try {
                    long endRow = reader.getRecordNumNear(src.numBytesLast);
                    rowSplit.numRowsLast = endRow;
                } catch (IOException e) {
                    reader.close();
                    throw e;
                }
            }
            if (reader != null)
                reader.close();
        }

        /**
         * Get a sampling of keys and calculate how data are distributed among
         * key-partitioned buckets. The implementation attempts to calculate all
         * information in one shot to avoid reading TFile index multiple times.
         * Special care is also taken that memory requirement is not linear to the
         * size of total data set for the column group.
         * 
         * @param n
         *          Targeted size of the sampling.
         * @param nTables
         *          Number of tables in a union
         * @return KeyDistribution object.
         * @throws IOException
         */
        public KeyDistribution getKeyDistribution(int n, int nTables, BlockDistribution lastBd) throws IOException {
            // TODO: any need for a similar capability for unsorted CGs?
            if (!isSorted()) {
                throw new IOException("Cannot get key distribution for unsorted table");
            }
            KeyDistribution ret = new KeyDistribution(comparator);

            if (n < 0) {
                /*
                Path keyRangeFile = new Path(path, KEY_RANGE_FOR_DEFAULT_SORTED_SPLIT);
                if (fs.exists(keyRangeFile))
                {
                  try {
                    FSDataInputStream ins = fs.open(keyRangeFile);
                    long minStepSize = ins.readLong();
                    int size = ins.readInt();
                    for (int i = 0; i < size; i++)
                    {
                      BytesWritable keyIn = new BytesWritable();
                      keyIn.readFields(ins);
                      ByteArray key = new ByteArray(keyIn.getBytes());
                      ret.add(key);
                    }
                    ret.setMinStepSize(minStepSize);
                    return ret;
                  } catch (Exception e) {
                    // no-op
                  }
                }
                */
                n = 1;
            }

            Path[] paths = new Path[cgindex.size()];
            FileStatus[] tfileStatus = new FileStatus[paths.length];
            long totalBytes = 0;
            for (int i = 0; i < paths.length; ++i) {
                paths[i] = cgindex.getPath(i, path);
                tfileStatus[i] = fs.getFileStatus(paths[i]);
                totalBytes += tfileStatus[i].getLen();
            }

            final long minSize = getMinSplitSize(conf);
            final long EPSILON = (long) (minSize * (SPLIT_SLOP - 1));
            long goalSize = totalBytes / n;
            long batchSize = 0;
            BlockDistribution bd = new BlockDistribution();
            RawComparable prevKey = null;

            long minStepSize = -1;
            FSDataInputStream nextFsdis = null;
            TFile.Reader nextReader = null;
            for (int i = 0; i < paths.length; ++i) {
                FileStatus fstatus = tfileStatus[i];
                long blkSize = fstatus.getBlockSize();
                long fileLen = fstatus.getLen();
                long stepSize = Math.max(minSize, (goalSize < blkSize) ? goalSize : blkSize);
                if (minStepSize == -1 || minStepSize > stepSize)
                    minStepSize = stepSize;
                // adjust the block size by the scaling factor
                blkSize /= nTables;
                stepSize = Math.max(minSize, (goalSize < blkSize) ? goalSize : blkSize);
                FSDataInputStream fsdis = null;
                TFile.Reader reader = null;
                long remainLen = fileLen;
                try {
                    if (nextReader == null) {
                        fsdis = fs.open(paths[i]);
                        reader = new TFile.Reader(fsdis, fileLen, conf);
                    } else {
                        fsdis = nextFsdis;
                        reader = nextReader;
                    }
                    BlockLocation[] locations = fs.getFileBlockLocations(fstatus, 0, fileLen);
                    if (locations.length == 0) {
                        throw new AssertionError("getFileBlockLocations returns 0 location");
                    }

                    Arrays.sort(locations, new Comparator<BlockLocation>() {
                        @Override
                        public int compare(BlockLocation o1, BlockLocation o2) {
                            long diff = o1.getOffset() - o2.getOffset();
                            if (diff < 0)
                                return -1;
                            if (diff > 0)
                                return 1;
                            return 0;
                        }
                    });

                    long[] startOffsets = new long[locations.length];

                    for (int ii = 0; ii < locations.length; ii++)
                        startOffsets[ii] = locations[ii].getOffset();

                    boolean done = false;
                    while ((remainLen > 0) && !done) {
                        long splitBytes = remainLen > stepSize ? stepSize : remainLen;
                        long offsetBegin = fileLen - remainLen;
                        long offsetEnd = offsetBegin + splitBytes;
                        int indexBegin = getStartBlockIndex(startOffsets, offsetBegin);
                        int indexEnd = getEndBlockIndex(startOffsets, offsetEnd);
                        BlockLocation firstBlock = locations[indexBegin];
                        BlockLocation lastBlock = locations[indexEnd - 1];
                        long lastBlockOffsetBegin = lastBlock.getOffset();
                        long lastBlockOffsetEnd = lastBlockOffsetBegin + lastBlock.getLength();
                        if ((firstBlock.getOffset() > offsetBegin) || (lastBlockOffsetEnd < offsetEnd)) {
                            throw new AssertionError(
                                    "Block locations returned by getFileBlockLocations do not cover requested range");
                        }

                        // Adjust offsets
                        if ((offsetEnd > lastBlockOffsetBegin) && (offsetEnd - lastBlockOffsetBegin < EPSILON)) {
                            // the split includes a bit of the next block, remove it.
                            if (offsetEnd != fileLen) {
                                // only if this is not the last chunk
                                offsetEnd = lastBlockOffsetBegin;
                                splitBytes = offsetEnd - offsetBegin;
                                indexEnd--;
                            }
                        } else if ((lastBlockOffsetEnd > offsetEnd) && (lastBlockOffsetEnd - offsetEnd < EPSILON)) {
                            // the split includes almost the whole block, fill it.
                            offsetEnd = lastBlockOffsetEnd;
                            splitBytes = offsetEnd - offsetBegin;
                        }

                        RawComparable key = reader.getKeyNear(offsetEnd);
                        if (key == null) {
                            offsetEnd = fileLen;
                            splitBytes = offsetEnd - offsetBegin;
                            if (i < paths.length - 1) {
                                nextFsdis = fs.open(paths[i + 1]);
                                nextReader = new TFile.Reader(nextFsdis, tfileStatus[i + 1].getLen(), conf);
                                key = nextReader.getFirstKey();
                            }
                            done = true; // TFile index too large? Is it necessary now?
                        }
                        remainLen -= splitBytes;
                        batchSize += splitBytes;

                        if (key != null && batchSize >= stepSize) {
                            if (batchSize - splitBytes < EPSILON || splitBytes < EPSILON) {
                                // the last chunk or this chunk is small enough to create a new range for this key
                                setBlockDistribution(bd, reader, locations, fstatus, startOffsets, prevKey, key);
                                ret.add(key, bd);
                                batchSize = 0;
                                bd = new BlockDistribution();
                            } else {
                                ret.add(prevKey, bd);
                                batchSize = splitBytes;
                                bd = new BlockDistribution();
                                if (batchSize >= stepSize) {
                                    setBlockDistribution(bd, reader, locations, fstatus, startOffsets, prevKey,
                                            key);
                                    ret.add(key, bd);
                                    batchSize = 0;
                                    bd = new BlockDistribution();
                                } else {
                                    setBlockDistribution(bd, reader, locations, fstatus, startOffsets, prevKey,
                                            key);
                                }
                            }
                        } else {
                            setBlockDistribution(bd, reader, locations, fstatus, startOffsets, prevKey, key);
                        }
                        prevKey = key;
                    }
                } finally {
                    if (reader != null) {
                        try {
                            reader.close();
                        } catch (Exception e) {
                            // no-op;
                        }
                    }
                    if (fsdis != null) {
                        try {
                            fsdis.close();
                        } catch (Exception e) {
                            // no-op
                        }
                    }
                }
            }
            if (lastBd != null)
                lastBd.add(bd);
            ret.setMinStepSize(minStepSize);

            return ret;
        }
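
        /*
         * Illustrative sketch (tutorial-only, hypothetical parameters): sample
         * roughly 10 keys from a sorted column group; nTables is 1 because no
         * table union is involved, and lastBd collects the trailing blocks.
         */
        KeyDistribution exampleKeySample() throws IOException {
            BlockDistribution lastBd = new BlockDistribution();
            return getKeyDistribution(10, 1, lastBd);
        }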

        private void setBlockDistribution(BlockDistribution bd, TFile.Reader reader, BlockLocation[] locations,
                FileStatus fileStatus, long[] startOffsets, RawComparable begin, RawComparable end)
                throws IOException {
            long beginOffset, endOffset = -1;
            if (begin == null)
                beginOffset = 0;
            else
                beginOffset = reader.getOffsetForKey(begin);
            if (end != null) {
                if (begin == null)
                    begin = reader.getFirstKey();
                /* Compute endOffset only when the key range is non-empty. The check is
                 * needed because TFile has a 16-byte magic header that causes
                 * getOffsetForKey to return 16 (not 0) even for the first key.
                 */
                if (comparator.compare(begin, end) != 0)
                    endOffset = reader.getOffsetForKey(end);
            }
            int startBlockIndex = (beginOffset == 0 ? 0 : getStartBlockIndex(startOffsets, beginOffset));
            BlockLocation l;
            int endBlockIndex = (end == null ? locations.length
                    : endOffset == -1 ? startBlockIndex : getEndBlockIndex(startOffsets, endOffset));
            for (int ii = startBlockIndex; ii < endBlockIndex; ii++) {
                l = locations[ii];
                long blkBeginOffset = l.getOffset();
                long blkEndOffset = blkBeginOffset + l.getLength();
                if (blkEndOffset > blkBeginOffset) {
                    bd.add(l, blkEndOffset - blkBeginOffset);
                }
            }
        }

        /**
         * Get the status of the ColumnGroup.
         */
        public BasicTableStatus getStatus() throws IOException {
            if (cgindex == null)
                cgindex = buildIndex(fs, path, dirty, conf);
            return cgindex.status;
        }

        /**
         * Split the ColumnGroup by file orders.
         * 
         * @param n
         *          Targeted number of partitions.
         * @return A list of range-based splits, whose size may be less than or
         *         equal to n.
         */
        public List<CGRangeSplit> rangeSplit(int n) throws IOException {
            // The output of this method must be only dependent on the cgindex and
            // input parameter n - so that horizontally stitched column groups will
            // get aligned splits.
            if (cgindex == null)
                cgindex = buildIndex(fs, path, dirty, conf);
            int numFiles = cgindex.size();
            if ((numFiles < n) || (n < 0)) {
                return rangeSplit(numFiles);
            }
            List<CGRangeSplit> lst = new ArrayList<CGRangeSplit>();
            int beginIndex = 0;
            for (int i = 0; i < n; ++i) {
                int endIndex = (int) ((long) (i + 1) * numFiles / n);
                lst.add(new CGRangeSplit(beginIndex, endIndex - beginIndex));
                beginIndex = endIndex;
            }
            return lst;
        }
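
        /*
         * Illustrative sketch (tutorial-only): split the column group into at
         * most 4 file-aligned partitions (4 is a hypothetical parallelism) and
         * scan each partition with its own scanner.
         */
        void exampleRangeScan() throws IOException, ParseException {
            for (CGRangeSplit split : rangeSplit(4)) {
                CGScanner scanner = getScanner(split, false);
                try {
                    // consume this partition's rows with getKey()/getValue()/advance()
                } finally {
                    scanner.close();
                }
            }
        }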

        /**
         * We already use FileInputFormat to create byte offset-based input
         * splits. Their information is encoded in starts, lengths and paths.
         * This method wraps that information into CGRowSplit objects at the
         * column group level.
         * 
         * @param starts array of starting byte offsets of the fileSplits.
         * @param lengths array of lengths of the fileSplits.
         * @param paths array of paths of the fileSplits.
         * @param batches batch boundaries: batch i covers the fileSplits from
         *        index batches[i] through batches[i + 1] - 1.
         * @param numBatches number of batches.
         * @return A list of CGRowSplit objects.
         */
        public List<CGRowSplit> rowSplit(long[] starts, long[] lengths, Path[] paths, int[] batches, int numBatches)
                throws IOException {
            List<CGRowSplit> lst = new ArrayList<CGRowSplit>();
            CGRowSplit cgRowSplit;
            long startFirst, bytesFirst, bytesLast;
            int length;

            if (numBatches == 0) {
                cgRowSplit = new CGRowSplit(null, null, 0, -1, 0, 0);
                lst.add(cgRowSplit);
                return lst;
            }

            if (cgindex == null)
                cgindex = buildIndex(fs, this.path, dirty, conf);

            if (cgindex.size() == 0) {
                cgRowSplit = new CGRowSplit(null, null, 0, -1, 0, 0);
                lst.add(cgRowSplit);
                return lst;
            }

            for (int i = 0; i < numBatches; i++) {
                int indexFirst = batches[i];
                int indexLast = batches[i + 1] - 1;
                startFirst = starts[indexFirst];
                bytesFirst = lengths[indexFirst];
                bytesLast = lengths[indexLast];
                length = batches[i + 1] - batches[i];
                String[] namesInSplit = new String[length];
                long[] sizesInSplit = new long[length];
                for (int j = 0; j < length; j++) {
                    namesInSplit[j] = paths[indexFirst + j].getName();
                    sizesInSplit[j] = cgindex.get(cgindex.getFileIndex(paths[indexFirst + j])).bytes;
                }
                cgRowSplit = new CGRowSplit(namesInSplit, sizesInSplit, length, startFirst, bytesFirst, bytesLast);
                lst.add(cgRowSplit);
            }

            return lst;
        }

        void rearrangeFileIndices(FileStatus[] fileStatus) throws IOException {
            int size = fileStatus.length;
            FileStatus[] result = new FileStatus[size];
            if (cgindex == null)
                cgindex = buildIndex(fs, path, dirty, conf);
            if (size < cgindex.size())
                throw new AssertionError("Incorrect file list size");
            for (int j, i = 0; i < size; i++) {
                j = cgindex.getFileIndex(fileStatus[i].getPath());
                if (j != -1)
                    result[j] = fileStatus[i];
            }
            for (int i = 0; i < size; i++)
                fileStatus[i] = result[i];
        }

        /**
         * Is the ColumnGroup sorted?
         * 
         * @return Whether the ColumnGroup is sorted.
         */
        public boolean isSorted() {
            return cgschema.isSorted();
        }

        @Override
        public void close() throws IOException {
            if (!closed) {
                closed = true;
            }
        }

        /**
         * A simple wrapper over TFile.Reader.Scanner that simplifies creation
         * and resource management.
         */
        static class TFileScanner implements Closeable {
            boolean closed = true;
            FSDataInputStream ins;
            TFile.Reader reader;
            TFile.Reader.Scanner scanner;
            TupleReader tupleReader;

            TFileScanner(FileSystem fs, Path path, CGRowSplit rowRange, RawComparable begin, RawComparable end,
                    boolean first, boolean last, CGSchema cgschema, Projection projection, Configuration conf)
                    throws IOException, ParseException {
                try {
                    ins = fs.open(path);
                    /*
                     * compressor is inside cgschema
                     */
                    reader = new TFile.Reader(ins, fs.getFileStatus(path).getLen(), conf);
                    if (rowRange != null && rowRange.startByteFirst != -1) {
                        if (first && rowRange.startByteFirst != -1)
                            scanner = reader.createScannerByRecordNum(rowRange.startRowFirst,
                                    rowRange.startRowFirst + rowRange.numRowsFirst);
                        else if (last && rowRange.numBytesLast != -1)
                            scanner = reader.createScannerByRecordNum(0, rowRange.numRowsLast);
                        else
                            scanner = reader.createScanner();
                    } else {
                        /* TODO: More investigation is needed here. We use the
                         * deprecated API just so that zebra can work with a
                         * hadoop jar that does not contain HADOOP-6218 (record
                         * ids for TFile). This is expected to be temporary;
                         * later we should switch to the non-deprecated API.
                         */
                        scanner = reader.createScanner(begin, end);
                    }
                    /*
                     * serializer is inside cgschema: different serializer will require
                     * different Reader: for pig, it's TupleReader
                     */
                    tupleReader = new TupleReader(cgschema.getSchema(), projection);
                    closed = false;
                } finally {
                    if (closed) { // failed to instantiate the object.
                        if (scanner != null) {
                            try {
                                scanner.close();
                            } catch (Exception e) {
                                // no-op
                            }
                        }

                        if (reader != null) {
                            try {
                                reader.close();
                            } catch (Exception e) {
                                // no op
                            }
                        }

                        if (ins != null) {
                            try {
                                ins.close();
                            } catch (Exception e) {
                                // no op
                            }
                        }
                    }
                }
            }

            void rewind() throws IOException {
                scanner.rewind();
            }

            void getKey(BytesWritable key) throws IOException {
                scanner.entry().getKey(key);
            }

            void getValue(Tuple val) throws IOException, ParseException {
                DataInputStream dis = scanner.entry().getValueStream();
                try {
                    tupleReader.get(dis, val);

                } finally {
                    dis.close();
                }
            }

            boolean seekTo(BytesWritable key) throws IOException {
                return scanner.seekTo(key.getBytes(), 0, key.getLength());
            }

            boolean advance() throws IOException {
                return scanner.advance();
            }

            boolean atEnd() {
                return scanner.atEnd();
            }

            void seekToEnd() throws IOException {
                scanner.seekToEnd();
            }

            @Override
            public void close() throws IOException {
                if (!closed) {
                    closed = true;
                    try {
                        scanner.close();
                    } catch (Exception e) {
                        // no op
                    }

                    try {
                        reader.close();
                    } catch (Exception e) {
                        // no op
                    }

                    try {
                        ins.close();
                    } catch (Exception e) {
                        // no op
                    }
                }
            }
        }

        /**
         * ColumnGroup scanner
         */
        class CGScanner implements TableScanner {
            private Projection logicalSchema = null;
            private TFileScannerInfo[] scanners;
            private boolean closeReader;
            private int beginIndex, endIndex;
            private int current; // current scanner
            private boolean scannerClosed = true;
            private CGRowSplit rowRange;
            private TFileScanner scanner;

            private class TFileScannerInfo {
                boolean first, last;
                Path path;
                RawComparable begin, end;

                TFileScannerInfo(boolean first, boolean last, Path path, RawComparable begin, RawComparable end) {
                    this.first = first;
                    this.last = last;
                    this.begin = begin;
                    this.end = end;
                    this.path = path;
                }

                TFileScanner getTFileScanner() throws IOException {
                    try {
                        return new TFileScanner(fs, path, rowRange, begin, end, first, last, cgschema,
                                logicalSchema, conf);
                    } catch (ParseException e) {
                        throw new IOException(e.getMessage());
                    }
                }
            }

            CGScanner(CGRangeSplit split, boolean closeReader) throws IOException, ParseException {
                if (cgindex == null)
                    cgindex = buildIndex(fs, path, dirty, conf);
                if (split == null) {
                    beginIndex = 0;
                    endIndex = cgindex.size();
                } else {
                    beginIndex = split.start;
                    endIndex = split.start + split.len;
                }
                init(null, null, null, closeReader);
            }

            /**
             * Scanner for a range specified by the given row range.
             * 
             * @param rowRange see {@link CGRowSplit}
             * @param closeReader
             */
            CGScanner(CGRowSplit rowRange, boolean closeReader) throws IOException, ParseException {
                beginIndex = 0;
                endIndex = rowRange.length;
                init(rowRange, null, null, closeReader);
            }

            CGScanner(RawComparable beginKey, RawComparable endKey, boolean closeReader)
                    throws IOException, ParseException {
                beginIndex = 0;
                endIndex = cgindex.size();
                if (beginKey != null) {
                    beginIndex = cgindex.lowerBound(beginKey, comparator);
                }
                if (endKey != null) {
                    endIndex = cgindex.lowerBound(endKey, comparator);
                    if (endIndex < cgindex.size()) {
                        ++endIndex;
                    }
                }
                init(null, beginKey, endKey, closeReader);
            }

            private void init(CGRowSplit rowRange, RawComparable beginKey, RawComparable endKey, boolean doClose)
                    throws IOException, ParseException {
                this.rowRange = rowRange;
                if (beginIndex > endIndex) {
                    throw new IllegalArgumentException("beginIndex > endIndex");
                }
                logicalSchema = ColumnGroup.Reader.this.getProjection();
                List<TFileScannerInfo> tmpScanners = new ArrayList<TFileScannerInfo>(endIndex - beginIndex);
                try {
                    boolean first, last, realFirst = true;
                    Path myPath;
                    for (int i = beginIndex; i < endIndex; ++i) {
                        first = (i == beginIndex);
                        last = (i == endIndex - 1);
                        RawComparable begin = first ? beginKey : null;
                        RawComparable end = last ? endKey : null;
                        TFileScannerInfo scanner;
                        if (rowRange == null)
                            myPath = cgindex.getPath(i, path);
                        else
                            myPath = new Path(path, rowRange.names[i]);
                        scanner = new TFileScannerInfo(first, last, myPath, begin, end);
                        if (realFirst) {
                            this.scanner = scanner.getTFileScanner();
                            if (this.scanner.atEnd()) {
                                this.scanner.close();
                                this.scanner = null;
                            } else {
                                realFirst = false;
                                tmpScanners.add(scanner);
                            }
                        } else {
                            TFileScanner myScanner = scanner.getTFileScanner();
                            if (!myScanner.atEnd())
                                tmpScanners.add(scanner);
                            myScanner.close();
                        }
                    }
                    scanners = tmpScanners.toArray(new TFileScannerInfo[tmpScanners.size()]);
                    this.closeReader = doClose;
                    scannerClosed = false;
                } finally {
                    if (scannerClosed) { // failed to initialize the object.
                        if (scanner != null)
                            scanner.close();
                    }
                }
            }

            @Override
            public void getKey(BytesWritable key) throws IOException {
                if (atEnd()) {
                    throw new EOFException("No more key-value to read");
                }
                scanner.getKey(key);
            }

            @Override
            public void getValue(Tuple row) throws IOException {
                if (atEnd()) {
                    throw new EOFException("No more key-value to read");
                }
                try {
                    scanner.getValue(row);
                } catch (ParseException e) {
                    throw new IOException("Invalid Projection: " + e.getMessage());
                }
            }

            public void getCGKey(BytesWritable key) throws IOException {
                scanner.getKey(key);
            }

            public void getCGValue(Tuple row) throws IOException {
                try {
                    scanner.getValue(row);
                } catch (ParseException e) {
                    throw new IOException("Invalid Projection: " + e.getMessage());
                }
            }

            @Override
            public String getProjection() {
                return logicalSchema.toString();
            }

            public Schema getSchema() {
                return logicalSchema.getSchema();
            }

            @Override
            public boolean advance() throws IOException {
                if (atEnd()) {
                    return false;
                }
                scanner.advance();
                while (true) {
                    if (scanner.atEnd()) {
                        scanner.close();
                        scanner = null;
                        ++current;
                        if (!atEnd()) {
                            scanner = scanners[current].getTFileScanner();
                        } else
                            return false;
                    } else
                        return true;
                }
            }

            public boolean advanceCG() throws IOException {
                scanner.advance();
                while (true) {
                    if (scanner.atEnd()) {
                        scanner.close();
                        scanner = null;
                        ++current;
                        if (!atEnd()) {
                            scanner = scanners[current].getTFileScanner();
                        } else
                            return false;
                    } else
                        return true;
                }
            }

            @Override
            public boolean atEnd() throws IOException {
                return (current >= scanners.length);
            }

            @Override
            public boolean seekTo(BytesWritable key) throws IOException {
                if (!isSorted()) {
                    throw new IOException("Cannot seek in unsorted Column Gruop");
                }
                if (atEnd()) {
                    return false;
                }
                int index = cgindex.lowerBound(new ByteArray(key.getBytes(), 0, key.getLength()), comparator);
                if (index >= endIndex) {
                    seekToEnd();
                    return false;
                }

                if (index < beginIndex) {
                    // move to the beginning
                    index = beginIndex;
                }

                int prevCurrent = current;
                current = index - beginIndex;
                if (current != prevCurrent) {
                    if (scanner != null) {
                        try {
                            scanner.close();
                        } catch (Exception e) {
                            // no-op
                        }
                    }
                    scanner = scanners[current].getTFileScanner();
                }
                return scanner.seekTo(key);
            }

            @Override
            public void seekToEnd() throws IOException {
                if (scanner != null) {
                    try {
                        scanner.close();
                    } catch (Exception e) {
                        // no-op
                    }
                }
                scanner = null;
                current = scanners.length;
            }

            @Override
            public void close() throws IOException {
                if (!scannerClosed) {
                    scannerClosed = true;
                    if (scanner != null) {
                        try {
                            scanner.close();
                            scanner = null;
                        } catch (Exception e) {
                            // no-op
                        }
                    }
                    if (closeReader) {
                        Reader.this.close();
                    }
                }
            }
        }

        public static class CGRangeSplit implements Writable {
            int start = 0; // starting index in the list
            int len = 0;

            CGRangeSplit(int start, int len) {
                this.start = start;
                this.len = len;
            }

            public CGRangeSplit() {
                // no-op;
            }

            @Override
            public void readFields(DataInput in) throws IOException {
                start = Utils.readVInt(in);
                len = Utils.readVInt(in);
            }

            @Override
            public void write(DataOutput out) throws IOException {
                Utils.writeVInt(out, start);
                Utils.writeVInt(out, len);
            }
        }
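
        /*
         * Illustrative sketch (tutorial-only): CGRangeSplit serializes as two
         * vints, so it round-trips through any DataOutput/DataInput pair.
         */
        static CGRangeSplit exampleRoundTrip(CGRangeSplit split) throws IOException {
            java.io.ByteArrayOutputStream buf = new java.io.ByteArrayOutputStream();
            split.write(new DataOutputStream(buf));
            CGRangeSplit copy = new CGRangeSplit();
            copy.readFields(new DataInputStream(new java.io.ByteArrayInputStream(buf.toByteArray())));
            return copy;
        }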

        public static class CGRowSplit implements Writable {
            int length; // number of files in the batch
            long startByteFirst = -1;
            long numBytesFirst;
            long startRowFirst = -1;
            long numRowsFirst = -1;
            long numBytesLast = -1;
            long numRowsLast = -1;
            String[] names;
            long[] sizes = null;

            CGRowSplit(String[] names, long[] sizes, int length, long startFirst, long bytesFirst, long bytesLast)
                    throws IOException {
                this.names = names;
                this.sizes = sizes;
                this.length = length;

                if (startFirst != -1) {
                    startByteFirst = startFirst;
                    numBytesFirst = bytesFirst;
                }
                if (bytesLast != -1 && this.length > 1) {
                    numBytesLast = bytesLast;
                }
            }

            public CGRowSplit() {
                // no-op;
            }

            @Override
            public String toString() {
                StringBuilder sb = new StringBuilder();
                sb.append("{length = " + length + "}\n");
                for (int i = 0; i < length; i++) {
                    sb.append("{name = " + names[i] + "}\n");
                    sb.append("{size = " + sizes[i] + "}\n");
                }
                sb.append("{startByteFirst = " + startByteFirst + "}\n");
                sb.append("{numBytesFirst = " + numBytesFirst + "}\n");
                sb.append("{startRowFirst = " + startRowFirst + "}\n");
                sb.append("{numRowsFirst = " + numRowsFirst + "}\n");
                sb.append("{numBytesLast = " + numBytesLast + "}\n");
                sb.append("{numRowsLast = " + numRowsLast + "}\n");

                return sb.toString();
            }

            @Override
            public void readFields(DataInput in) throws IOException {
                length = Utils.readVInt(in);
                if (length > 0) {
                    names = new String[length];
                    sizes = new long[length];
                }
                for (int i = 0; i < length; i++) {
                    names[i] = Utils.readString(in);
                    sizes[i] = Utils.readVLong(in);
                }
                startByteFirst = Utils.readVLong(in);
                numBytesFirst = Utils.readVLong(in);
                startRowFirst = Utils.readVLong(in);
                numRowsFirst = Utils.readVLong(in);
                numBytesLast = Utils.readVLong(in);
                numRowsLast = Utils.readVLong(in);
            }

            @Override
            public void write(DataOutput out) throws IOException {
                Utils.writeVInt(out, length);
                for (int i = 0; i < length; i++) {
                    Utils.writeString(out, names[i]);
                    Utils.writeVLong(out, sizes[i]);
                }
                Utils.writeVLong(out, startByteFirst);
                Utils.writeVLong(out, numBytesFirst);
                Utils.writeVLong(out, startRowFirst);
                Utils.writeVLong(out, numRowsFirst);
                Utils.writeVLong(out, numBytesLast);
                Utils.writeVLong(out, numRowsLast);
            }
        }
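
        /*
         * Interpretation note, derived from the fields above rather than from
         * any original comment: a CGRowSplit covers a batch of `length` files.
         * Only the first file may be entered mid-stream (startByteFirst and
         * numBytesFirst, later resolved into startRowFirst and numRowsFirst),
         * and only the last file may be cut short (numBytesLast and
         * numRowsLast); every file in between is consumed whole.
         */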

        private static class SplitColumn {
            SplitColumn(Partition.SplitType st) {
                this.st = st;
            }

            SplitColumn(int fieldIndex, Partition.SplitType st) {
                this.fieldIndex = fieldIndex;
                this.st = st;
            }

            SplitColumn(int fieldIndex, String key, Partition.SplitType st) {
                this.fieldIndex = fieldIndex;
                this.key = key;
                this.st = st;
            }

            SplitColumn(int fieldIndex, int projIndex, SplitColumn leaf, String key, Partition.SplitType st) {
                this(fieldIndex, key, st);
                this.projIndex = projIndex;
                this.leaf = leaf; // keep the leaf holder so split() can reach its projection tuple
            }

            int fieldIndex = -1; // field index to parent
            int projIndex = -1; // index in projection: only used by leaves
            SplitColumn leaf = null;
            String key = null; // MAP key to parent
            ArrayList<SplitColumn> children = null;
            int index = -1; // index in the logical schema
            Object field = null;
            Partition.SplitType st = Partition.SplitType.NONE;

            void dispatch(Object field) {
                this.field = field;
            }

            @SuppressWarnings("unchecked")
            void split() throws ExecException {
                int size = children.size();
                if (st == Partition.SplitType.RECORD) {
                    for (int i = 0; i < size; i++) {
                        if (children.get(i).projIndex != -1) {
                            // a leaf: set its projection slot directly
                            ((Tuple) (leaf.field)).set(children.get(i).projIndex,
                                    ((Tuple) field).get(children.get(i).fieldIndex));
                        } else {
                            children.get(i).field = ((Tuple) field).get(children.get(i).fieldIndex);
                        }
                    }
                } else if (st == Partition.SplitType.MAP) {
                    for (int i = 0; i < size; i++) {
                        if (children.get(i).projIndex != -1) {
                            // a leaf: set its projection slot directly
                            ((Tuple) (leaf.field)).set(children.get(i).projIndex,
                                    ((Map<String, Object>) field).get(children.get(i).key));
                        } else {
                            children.get(i).field = ((Map<String, Object>) field).get(children.get(i).key);
                        }
                    }
                }
            }
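
            /*
             * Illustrative walk-through (not from the original source): for a
             * RECORD split whose child 0 is a leaf with fieldIndex 1 and
             * projIndex 0, calling dispatch(parentTuple) followed by split()
             * copies parentTuple.get(1) into slot 0 of the leaf's projection
             * tuple (leaf.field).
             */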

            void addChild(SplitColumn child) {
                if (children == null)
                    children = new ArrayList<SplitColumn>();
                children.add(child);
            }
        }
    }

    /**
     * Column Group writer.
     */
    public static class Writer implements Closeable {
        Path path;
        Path finalOutputPath;
        Configuration conf;
        FileSystem fs;
        CGSchema cgschema;
        private boolean finished, closed;
        CGIndex index;

        /**
         * Create a ColumnGroup writer. The semantics are as follows:
         * <ol>
         * <li>If path does not exist:
         * <ul>
         * <li>create the path directory
         * <li>write out the meta data file.
         * </ul>
         * <li>If path exists and the directory is empty: write out the meta data
         * file.
         * <li>If path exists and contains what looks like a complete Column Group,
         * a ColumnGroupExists exception will be thrown.
         * <li>If path exists and overwrite is true, remove all files under the
         * directory and resume as in Step 2.
         * <li>If path exists, the directory is not empty, and overwrite is false,
         * ColumnGroupExists will be thrown.
         * </ol>
         * This constructor never removes a valid/complete ColumnGroup.
         * 
         * @param path
         *          The path to the Column Group; it must either not exist or be a
         *          directory.
         * @param schema
         *          The schema of the ColumnGroup. In this version of the
         *          implementation, the schema of a table is a comma-separated list
         *          of column names, such as "FirstName, LastName, Sex, Department".
         * @param sorted
         *          Whether the column group to be created is sorted. If set to
         *          true, the rows inserted by every inserter created from this
         *          Writer must be sorted. Additionally, there must exist an
         *          ordering of the inserters Ins-1, Ins-2, ... such that the rows
         *          created by Ins-1, followed by rows created by Ins-2, ... form a
         *          total order.
         * @param overwrite
         *          Should we overwrite the path if it already exists?
         * @param conf
         *          The configuration object.
         * @throws IOException
         */
        public Writer(Path path, String schema, boolean sorted, String name, String serializer, String compressor,
                String owner, String group, short perm, boolean overwrite, Configuration conf)
                throws IOException, ParseException {
            this(path, new Schema(schema), sorted, null, name, serializer, compressor, owner, group, perm,
                    overwrite, conf);
        }

        public Writer(Path path, Schema schema, boolean sorted, String name, String serializer, String compressor,
                String owner, String group, short perm, boolean overwrite, Configuration conf)
                throws IOException, ParseException {
            this(path, schema, sorted, null, name, serializer, compressor, owner, group, perm, overwrite, conf);
        }

        public Writer(Path path, String schema, boolean sorted, String comparator, String name, String serializer,
                String compressor, String owner, String group, short perm, boolean overwrite, Configuration conf)
                throws IOException, ParseException {
            this(path, new Schema(schema), sorted, comparator, name, serializer, compressor, owner, group, perm,
                    overwrite, conf);
        }

        public Writer(Path path, Schema schema, boolean sorted, String comparator, String name, String serializer,
                String compressor, String owner, String group, short perm, boolean overwrite, Configuration conf)
                throws IOException, ParseException {
            this.path = path;
            this.conf = conf;
            this.finalOutputPath = path;

            fs = path.getFileSystem(conf);

            // If the meta file already exists, the ColumnGroup is complete and
            // valid, so we will not proceed.
            checkMetaFile(path);

            // if overwriting, remove everything
            if (overwrite) {
                fs.delete(path, true);
            }

            // create final output path and temporary output path
            checkPath(path, true);

            Path parent = path.getParent();
            Path tmpPath1 = new Path(parent, "_temporary");
            Path tmpPath2 = new Path(tmpPath1, name);
            checkPath(tmpPath2, true);

            cgschema = new CGSchema(schema, sorted, comparator, name, serializer, compressor, owner, group, perm);
            CGSchema sfNew = CGSchema.load(fs, path);
            if (sfNew != null) {
                // sanity check - compare input with on-disk schema.
                if (!sfNew.equals(cgschema)) {
                    throw new IOException("Schema passed in is different from the one on disk");
                }
            } else {
                // create the schema file in FS
                cgschema.create(fs, path);
            }
        }
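
        /*
         * A hedged construction sketch; the path, schema, and option values
         * below are illustrative, not taken from the original source:
         *
         *   Configuration conf = new Configuration();
         *   ColumnGroup.Writer writer = new ColumnGroup.Writer(
         *       new Path("/table/cg0"),      // column group directory
         *       "FirstName, LastName",       // schema as a column list
         *       false,                       // sorted
         *       "cg0",                       // column group name
         *       "pig",                       // serializer (illustrative)
         *       "gz",                        // compressor
         *       null, null,                  // owner/group: keep defaults
         *       (short) -1,                  // perm: keep defaults
         *       false,                       // overwrite
         *       conf);
         */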

        /**
         * Reopen an already created ColumnGroup for writing. It accepts a
         * temporary path for the column group where a CGInserter can write. A
         * RuntimeException will be thrown if the table is already closed, or if
         * createMetaBlock() is called by some other process.
         */
        public Writer(Path finalPath, Path workPath, Configuration conf) throws IOException, ParseException {
            this.path = workPath;
            finalOutputPath = finalPath;
            this.conf = conf;
            fs = path.getFileSystem(conf);
            checkPath(finalOutputPath, false);
            checkPath(path, true);
            checkMetaFile(finalOutputPath);
            cgschema = CGSchema.load(fs, finalOutputPath);
        }

        /**
         * Reopen an already created ColumnGroup for writing. It takes a CGSchema
         * to set its own cgschema instead of going to disk to fetch this
         * information.
         */
        public Writer(Path finalPath, Path workPath, CGSchema cgschema, Configuration conf)
                throws IOException, ParseException {
            this.path = workPath;
            finalOutputPath = finalPath;
            this.conf = conf;
            fs = path.getFileSystem(conf);
            this.cgschema = cgschema;
        }

        /**
         * Reopen an already created ColumnGroup for writing. A RuntimeException
         * will be thrown if the table is already closed, or if createMetaBlock()
         * is called by some other process.
         */
        public Writer(Path path, Configuration conf) throws IOException, ParseException {
            this.path = path;
            finalOutputPath = path;
            this.conf = conf;
            fs = path.getFileSystem(conf);
            checkPath(path, false);
            checkMetaFile(path);
            // read the schema file
            cgschema = CGSchema.load(fs, path);
        }

        /**
         * Release resources used by the object. Unlike close(), finish() does not
         * make the table immutable. However, if a user has already added some meta
         * data into the CG, then finish() will close the column group.
         */
        public void finish() {
            if (!finished) {
                finished = true;
            }
        }

        @Override
        public void close() throws IOException {
            if (!finished) {
                finish();
            }
            if (!closed) {
                closed = true;
                createIndex();
            }
        }

        public Schema getSchema() {
            return cgschema.getSchema();
        }

        /**
         * Get an inserter with a given name.
         * 
         * @param name
         *          the name of the inserter.
         * @param finishWriter
         *          finish the underlying Writer object upon the close of the
         *          Inserter. Should be set to true if only one inserter operates
         *          on the table, so that finish() is called once the Inserter is
         *          closed.
         * 
         * @return A table inserter object.
         * @throws IOException
         */
        public TableInserter getInserter(String name, boolean finishWriter) throws IOException {
            return getInserter(name, finishWriter, true);
        }

        /**
         * Get an inserter with a given name.
         * 
         * @param name
         *          the name of the inserter.
         * @param finishWriter
         *          finish the underlying Writer object upon the close of the
         *          Inserter. Should be set to true if only one inserter operates
         *          on the table, so that finish() is called once the Inserter is
         *          closed.
         * @param checkType
         *          whether or not to do a type check.
         * 
         * @return A table inserter object.
         * @throws IOException
         */
        public TableInserter getInserter(String name, boolean finishWriter, boolean checkType) throws IOException {
            if (finished) {
                throw new IOException("ColumnGroup has been closed for insertion.");
            }
            return new CGInserter(name, finishWriter, checkType);
        }
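
        /*
         * A minimal insertion sketch, assuming a Writer built as in the sketch
         * above; TypesUtils.createTuple is assumed to be the tuple factory used
         * elsewhere in zebra:
         *
         *   TableInserter ins = writer.getInserter("part-00000", true);
         *   Tuple row = TypesUtils.createTuple(writer.getSchema());
         *   row.set(0, "John");
         *   row.set(1, "Smith");
         *   ins.insert(new BytesWritable("key0".getBytes()), row);
         *   ins.close(); // finishWriter == true, so this also finishes the Writer
         */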

        private void createIndex() throws IOException {
            MetaFile.Writer metaFile = MetaFile.createWriter(makeMetaFilePath(finalOutputPath), conf);
            index = buildIndex(fs, finalOutputPath, false, conf);
            DataOutputStream dos = metaFile.createMetaBlock(BLOCK_NAME_INDEX);
            try {
                index.write(dos);
            } finally {
                dos.close();
            }
            metaFile.close();
        }

        private void checkPath(Path p, boolean createNew) throws IOException {
            // check existence of path
            if (!fs.exists(p)) {
                if (createNew) {
                    fs.mkdirs(p);
                } else {
                    throw new IOException("Path doesn't exists for appending: " + p);
                }
            }
            if (!fs.getFileStatus(p).isDir()) {
                throw new IOException("Path exists but not a directory: " + p);
            }
        }

        private void checkMetaFile(Path p) throws IOException {
            Path pathMeta = new Path(p, META_FILE);
            if (fs.exists(pathMeta)) {
                throw new IOException("Index meta file already exists: " + pathMeta);
            }
        }

        /**
         * Inserter for ColumnGroup
         */
        class CGInserter implements TableInserter {
            String name;
            String tmpName;
            boolean finishWriter;
            FSDataOutputStream out;
            TFile.Writer tfileWriter;
            TupleWriter tupleWriter;
            boolean closed = true;
            boolean checkType = true;

            private void createTempFile() throws IOException {
                int maxTrial = 10;
                String prefix = ".tmp." + name + ".";
                Random random = new Random();

                while (true) {
                    /*
                     * Seed the RNG from several runtime values so that concurrent
                     * inserters are unlikely to pick the same temporary file name.
                     */
                    random.setSeed(
                            System.nanoTime() * Thread.currentThread().getId() * Runtime.getRuntime().freeMemory());
                    try {
                        tmpName = prefix + String.format("%08X", random.nextInt());
                        Path tmpPath = new Path(path, tmpName);
                        fs.mkdirs(path);

                        if (cgschema.getOwner() != null || cgschema.getGroup() != null) {
                            fs.setOwner(path, cgschema.getOwner(), cgschema.getGroup());
                        }

                        FsPermission permission = null;
                        if (cgschema.getPerm() != -1) {
                            permission = new FsPermission((short) cgschema.getPerm());
                            fs.setPermission(path, permission);
                        }

                        out = fs.create(tmpPath, false);

                        if (cgschema.getOwner() != null || cgschema.getGroup() != null) {
                            fs.setOwner(tmpPath, cgschema.getOwner(), cgschema.getGroup());
                        }

                        if (cgschema.getPerm() != -1) {
                            fs.setPermission(tmpPath, permission);
                        }
                        return;
                    } catch (IOException e) {
                        --maxTrial;
                        if (maxTrial == 0) {
                            throw e;
                        }
                        Thread.yield();
                    }
                }
            }

            CGInserter(String name, boolean finishWriter, boolean checkType) throws IOException {
                this.name = name;
                this.finishWriter = finishWriter;
                this.tupleWriter = new TupleWriter(getSchema());
                this.checkType = checkType;

                try {
                    createTempFile();
                    tfileWriter = new TFile.Writer(out, getMinBlockSize(conf), cgschema.getCompressor(),
                            cgschema.getComparator(), conf);
                    closed = false;
                } finally {
                    if (closed) {
                        if (tfileWriter != null) {
                            try {
                                tfileWriter.close();
                            } catch (Exception e) {
                                // no-op
                            }
                        }
                        if (out != null) {
                            try {
                                out.close();
                            } catch (Exception e) {
                                // no-op
                            }
                        }
                        if (tmpName != null) {
                            try {
                                fs.delete(new Path(path, tmpName), false);
                            } catch (Exception e) {
                                // no-op
                            }
                        }
                    }
                }
            }

            @Override
            public Schema getSchema() {
                return ColumnGroup.Writer.this.getSchema();
            }

            @Override
            public void insert(BytesWritable key, Tuple row) throws IOException {
                /*
                 * If checkType is true, we check only the first row - this is a
                 * sanity check that prevents users from writing rows that do not
                 * match the output schema. If checkType is false, no type check
                 * is performed.
                 */
                if (checkType) {
                    TypesUtils.checkCompatible(row, getSchema());
                    checkType = false;
                }

                DataOutputStream outKey = tfileWriter.prepareAppendKey(key.getLength());
                try {
                    outKey.write(key.getBytes(), 0, key.getLength());
                } finally {
                    outKey.close();
                }

                DataOutputStream outValue = tfileWriter.prepareAppendValue(-1);
                try {
                    tupleWriter.put(outValue, row);
                } finally {
                    outValue.close();
                }
            }

            @Override
            public void close() throws IOException {
                if (closed) {
                    return;
                }
                closed = true;

                try {
                    // TODO: add schema to each TFile as a meta block?

                    tfileWriter.close();
                    tfileWriter = null;
                    out.close();
                    out = null;
                    // do renaming only if all the above is successful.
                    fs.rename(new Path(path, tmpName), new Path(finalOutputPath, name));

                    /*
                    if (cgschema.getOwner() != null || cgschema.getGroup() != null) {
                        fs.setOwner(new Path(path, name), cgschema.getOwner(), cgschema.getGroup());
                    }
                    FsPermission permission = null;
                    if (cgschema.getPerm() != -1) {
                        permission = new FsPermission((short) cgschema.getPerm());
                        fs.setPermission(path, permission);
                    }
                    */
                    tmpName = null;
                    if (finishWriter) {
                        finish();
                    }
                } finally {
                    if (tfileWriter != null) {
                        try {
                            tfileWriter.close();
                        } catch (Exception e) {
                            // no-op
                        }
                    }
                    if (out != null) {
                        try {
                            out.close();
                        } catch (Exception e) {
                            // no-op
                        }
                    }
                    if (tmpName != null) {
                        try {
                            fs.delete(new Path(path, tmpName), false);
                        } catch (Exception e) {
                            // no-op
                        }
                    }
                    if (finishWriter) {
                        try {
                            finish();
                        } catch (Exception e) {
                            // no-op
                        }
                    }
                }
            }
        }

    }

    /**
     * name, first and last key (inclusive) of a data file
     */
    static class CGIndexEntry implements RawComparable, Writable {
        int index;
        String name;
        long rows, bytes;
        RawComparable firstKey;
        RawComparable lastKey;

        // for reading
        public CGIndexEntry() {
            // no-op
        }

        // for writing
        public CGIndexEntry(String name, long rows, RawComparable firstKey, RawComparable lastKey) {
            this.name = name;
            this.rows = rows;
            this.firstKey = firstKey;
            this.lastKey = lastKey;
        }

        public int getIndex() {
            return index;
        }

        public String getName() {
            return name;
        }

        public long getRows() {
            return rows;
        }

        public RawComparable getFirstKey() {
            return firstKey;
        }

        public RawComparable getLastKey() {
            return lastKey;
        }

        void setIndex(int idx) {
            this.index = idx;
        }

        @Override
        public byte[] buffer() {
            return (lastKey != null) ? lastKey.buffer() : null;
        }

        @Override
        public int offset() {
            return (lastKey != null) ? lastKey.offset() : 0;
        }

        @Override
        public int size() {
            return (lastKey != null) ? lastKey.size() : 0;
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            name = Utils.readString(in);
            rows = Utils.readVLong(in);
            if (rows == 0) {
                firstKey = null;
                lastKey = null;
            } else {
                int firstKeyLen = Utils.readVInt(in);
                byte[] firstKeyBuffer = new byte[firstKeyLen];
                in.readFully(firstKeyBuffer);
                int lastKeyLen = Utils.readVInt(in);
                byte[] lastKeyBuffer = new byte[lastKeyLen];
                in.readFully(lastKeyBuffer);
                firstKey = new ByteArray(firstKeyBuffer);
                lastKey = new ByteArray(lastKeyBuffer);
            }
        }

        @Override
        public void write(DataOutput out) throws IOException {
            Utils.writeString(out, name);
            Utils.writeVLong(out, rows);
            if (rows > 0) {
                if ((firstKey == null) && (lastKey == null)) {
                    throw new IOException("In-memory only entry");
                }
                Utils.writeVInt(out, firstKey.size());
                out.write(firstKey.buffer(), firstKey.offset(), firstKey.size());
                Utils.writeVInt(out, lastKey.size());
                out.write(lastKey.buffer(), lastKey.offset(), lastKey.size());
            }
        }
    }
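
    /*
     * On-disk layout, as implied by write() and readFields() above: a VString
     * name and a VLong row count, then - only when rows > 0 - VInt-prefixed
     * first-key bytes followed by VInt-prefixed last-key bytes. Empty TFiles
     * therefore serialize without any key material.
     */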

    static class CGIndex implements Writable {
        boolean dirty = false;
        boolean sorted = true;
        BasicTableStatus status;
        ArrayList<CGIndexEntry> index;

        CGIndex() {
            status = new BasicTableStatus();
            index = new ArrayList<CGIndexEntry>();
        }

        int getFileIndex(Path path) throws IOException {
            String filename = path.getName();
            if (index.isEmpty())
                return -1;
            for (CGIndexEntry cgie : index) {
                if (cgie.getName().equals(filename)) {
                    return cgie.getIndex();
                }
            }
            return -1;
        }

        int size() {
            return index.size();
        }

        CGIndexEntry get(int i) {
            return index.get(i);
        }

        List<CGIndexEntry> getIndex() {
            return index;
        }

        Path getPath(int i, Path parent) {
            return new Path(parent, index.get(i).getName());
        }

        void sort(final Comparator<RawComparable> comparator) throws IOException {
            if (dirty && comparator != null) {
                throw new IOException("Cannot sort dirty index");
            }

            if (comparator != null) {
                // sort by keys. Empty TFiles always sort before non-empty ones,
                // and among themselves are ordered by name.
                Collections.sort(index, new Comparator<CGIndexEntry>() {

                    @Override
                    public int compare(CGIndexEntry o1, CGIndexEntry o2) {
                        if ((o1.getRows() == 0) && (o2.getRows() == 0)) {
                            return o1.getName().compareTo(o2.getName());
                        }
                        if (o1.getRows() == 0)
                            return -1;
                        if (o2.getRows() == 0)
                            return 1;
                        int cmprv = comparator.compare(o1.lastKey, o2.lastKey);
                        if (cmprv == 0) {
                            cmprv = comparator.compare(o1.firstKey, o2.firstKey);
                            if (cmprv == 0) {
                                cmprv = o1.getName().compareTo(o2.getName());
                            }
                        }
                        return cmprv;
                    }
                });

                for (int i = 0; i < index.size() - 1; ++i) {
                    RawComparable prevLastKey = index.get(i).lastKey;
                    RawComparable nextFirstKey = index.get(i + 1).firstKey;
                    if (nextFirstKey == null) {
                        continue;
                    }
                    if (comparator.compare(prevLastKey, nextFirstKey) > 0) {
                        throw new IOException("Overlapping key ranges");
                    }
                }
            } else {
                // sort by name
                Collections.sort(index, new Comparator<CGIndexEntry>() {

                    @Override
                    public int compare(CGIndexEntry o1, CGIndexEntry o2) {
                        return o1.name.compareTo(o2.name);
                    }
                });
            }

            // update status
            if ((!dirty) && (index.size() > 0)) {
                RawComparable keyFirst = index.get(0).getFirstKey();
                status.beginKey = new BytesWritable();
                status.beginKey.set(keyFirst.buffer(), keyFirst.offset(), keyFirst.size());
                RawComparable keyLast = index.get(index.size() - 1).getLastKey();
                status.endKey = new BytesWritable();
                status.endKey.set(keyLast.buffer(), keyLast.offset(), keyLast.size());
            }
            sorted = true;
        }

        // building full index.
        void add(long bytes, long rows, CGIndexEntry range) {
            status.size += bytes;
            status.rows += rows;
            index.add(range);
            sorted = false;
            range.bytes = bytes;
        }

        // building dirty index
        void add(long bytes, String name) {
            dirty = true;
            status.rows = -1; // reset rows to -1.
            status.size += bytes;
            CGIndexEntry next = new CGIndexEntry();
            next.name = name;
            index.add(next);
            sorted = false;
            next.bytes = bytes;
        }

        int lowerBound(RawComparable key, final Comparator<RawComparable> comparator) throws IOException {
            if ((key == null) || (comparator == null)) {
                throw new IllegalArgumentException("CGIndex.lowerBound: key and comparator must not be null");
            }

            if (!sorted) {
                sort(comparator);
            }

            // Treat null keys as the least key.
            return Utils.lowerBound(index, key, new Comparator<RawComparable>() {
                @Override
                public int compare(RawComparable o1, RawComparable o2) {
                    if ((o1.buffer() == null) && (o2.buffer() == null)) {
                        return 0;
                    }
                    if (o1.buffer() == null)
                        return -1;
                    if (o2.buffer() == null)
                        return 1;
                    return comparator.compare(o1, o2);
                }
            });
        }
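
        /*
         * Worked example (illustrative, not from the original source): since
         * each CGIndexEntry compares by its last key, for files whose last keys
         * are "b", "d", and "f", lowerBound("c", cmp) returns 1 - the first
         * file whose key range could still contain "c".
         */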

        @Override
        public void readFields(DataInput in) throws IOException {
            int n = Utils.readVInt(in);
            index.clear();
            index.ensureCapacity(n);
            for (int i = 0; i < n; ++i) {
                CGIndexEntry range = new CGIndexEntry();
                range.readFields(in);
                range.setIndex(i);
                index.add(range);
            }
            status.readFields(in);
            dirty = false;
            sorted = true;
        }

        @Override
        public void write(DataOutput out) throws IOException {
            if (dirty) {
                throw new IOException("Cannot write dirty index");
            }
            if (!sorted) {
                throw new IOException("Please sort index before calling write");
            }
            Utils.writeVInt(out, index.size());
            for (int i = 0; i < index.size(); ++i) {
                index.get(i).write(out);
            }
            status.write(out);
        }
    }

    public static class CGPathFilter implements PathFilter {
        private static Configuration conf;

        public static void setConf(Configuration c) {
            conf = c;
        }

        public boolean accept(Path p) {
            return !(p.getName().equals(META_FILE) || p.getName().equals(SCHEMA_FILE)
                    || p.getName().startsWith(".tmp.") || p.getName().startsWith("_")
                    || p.getName().startsWith("ttt") || p.getName().startsWith(getNonDataFilePrefix(conf)));
        }
    }
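
    /*
     * A hedged usage sketch; the directory path is illustrative. The filter
     * screens out meta, schema, temporary, and other non-data files when
     * listing a column-group directory:
     *
     *   CGPathFilter.setConf(conf);
     *   FileStatus[] dataFiles =
     *       fs.listStatus(new Path("/table/cg0"), new CGPathFilter());
     */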

    /**
     * Dump information about CG.
     * 
     * @param file
     *          Path string of the CG
     * @param out
     *          PrintStream to output the information.
     * @param conf
     *          The configuration object.
     * @throws IOException
     */
    public static void dumpInfo(String file, PrintStream out, Configuration conf) throws Exception {
        // final int maxKeySampleLen = 16;
        dumpInfo(new Path(file), out, conf);
    }

    public static void dumpInfo(Path path, PrintStream out, Configuration conf) throws Exception {
        dumpInfo(path, out, conf, 0);
    }

    public static void dumpInfo(Path path, PrintStream out, Configuration conf, int indent) throws Exception {
        // final int maxKeySampleLen = 16;
        IOutils.indent(out, indent);
        out.println();
        IOutils.indent(out, indent);
        out.println("Column Group : " + path);
        ColumnGroup.Reader reader = new ColumnGroup.Reader(path, false, conf);
        try {
            LinkedHashMap<String, String> properties = new LinkedHashMap<String, String>();
            IOutils.indent(out, indent);
            out.println("Name: " + reader.getName());
            IOutils.indent(out, indent);
            out.println("Serializer: " + reader.getSerializer());
            IOutils.indent(out, indent);
            out.println("Compressor: " + reader.getCompressor());
            IOutils.indent(out, indent);
            out.println("Group: " + reader.getGroup());
            IOutils.indent(out, indent);
            out.println("Perm: " + reader.getPerm());

            properties.put("Schema", reader.getSchema().toString());
            // Now output the properties table.
            int maxKeyLength = 0;
            Set<Map.Entry<String, String>> entrySet = properties.entrySet();
            for (Iterator<Map.Entry<String, String>> it = entrySet.iterator(); it.hasNext();) {
                Map.Entry<String, String> e = it.next();
                if (e.getKey().length() > maxKeyLength) {
                    maxKeyLength = e.getKey().length();
                }
            }
            for (Iterator<Map.Entry<String, String>> it = entrySet.iterator(); it.hasNext();) {
                Map.Entry<String, String> e = it.next();
                IOutils.indent(out, indent);
                out.printf("%-" + maxKeyLength + "s : %s\n", e.getKey(), e.getValue());
            }
            out.println("TFiles within the Column Group :");
            if (reader.cgindex == null)
                reader.cgindex = buildIndex(reader.fs, reader.path, reader.dirty, conf);
            for (CGIndexEntry entry : reader.cgindex.index) {
                IOutils.indent(out, indent);
                out.printf(" *Name : %s\n", entry.name);
                IOutils.indent(out, indent);
                out.printf("  Rows : %d\n", entry.rows);
                if (entry.firstKey != null) {
                    IOutils.indent(out, indent);
                    out.printf("  First Key : %s\n", headToString(entry.firstKey));
                }
                if (entry.lastKey != null) {
                    IOutils.indent(out, indent);
                    out.printf("  Larst Key : %s\n", headToString(entry.lastKey));
                }
                // dump TFile info
                // Path pathTFile = new Path(path, entry.name);
                // TFile.dumpInfo(pathTFile.toString(), out, conf);
            }
        } finally {
            try {
                reader.close();
            } catch (Exception e) {
                // no-op
            }
        }
    }

    private static String headToString(RawComparable raw) {
        return new String(raw.buffer(), raw.offset(), raw.size() > 70 ? 70 : raw.size());
    }

    /**
     * Dumping the CG information.
     * 
     * @param args
     *          A list of CG paths.
     */
    public static void main(String[] args) throws Exception {
        System.out.printf("ColumnGroup Dumper\n");
        if (args.length == 0) {
            System.out.println("Usage: java ... org.apache.hadoop.zebra.io.ColumnGroup cg-path [cg-path ...]");
            System.exit(0);
        }
        Configuration conf = new Configuration();
        for (String file : args) {
            try {
                dumpInfo(file, System.out, conf);
            } catch (IOException e) {
                e.printStackTrace(System.err);
            }
        }
    }
}