org.apache.hadoop.zebra.io.BasicTable.java Source code

Introduction

Here is the source code for org.apache.hadoop.zebra.io.BasicTable.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.hadoop.zebra.io;

import java.io.Closeable;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.zebra.tfile.TFile;
import org.apache.hadoop.zebra.tfile.Utils;
import org.apache.hadoop.zebra.tfile.MetaBlockAlreadyExists;
import org.apache.hadoop.zebra.tfile.MetaBlockDoesNotExist;
import org.apache.hadoop.zebra.tfile.Utils.Version;
import org.apache.hadoop.zebra.io.ColumnGroup.Reader.CGRangeSplit;
import org.apache.hadoop.zebra.io.ColumnGroup.Reader.CGRowSplit;
import org.apache.hadoop.zebra.io.ColumnGroup.Reader.CGScanner;
import org.apache.hadoop.zebra.types.CGSchema;
import org.apache.hadoop.zebra.mapreduce.BasicTableOutputFormat;
import org.apache.hadoop.zebra.parser.ParseException;
import org.apache.hadoop.zebra.types.Partition;
import org.apache.hadoop.zebra.types.Projection;
import org.apache.hadoop.zebra.types.ZebraConf;
import org.apache.hadoop.zebra.schema.Schema;
import org.apache.hadoop.zebra.parser.TableSchemaParser;
import org.apache.hadoop.zebra.pig.TableStorer;
import org.apache.hadoop.zebra.types.TypesUtils;
import org.apache.hadoop.zebra.types.SortInfo;
import org.apache.pig.data.Tuple;

/**
 * A materialized table that consists of one or more tightly coupled Column
 * Groups.
 * 
 * The following Configuration parameters can customize the behavior of
 * BasicTable.
 * <ul>
 * <li><b>table.output.tfile.minBlock.size</b> (int) Minimum compression block
 * size for underlying TFile (default to 1024*1024).
 * <li><b>table.output.tfile.compression</b> (String) Compression method (one of
 * "none", "lzo", "gz") (default is "gz"). @see
 * {@link TFile#getSupportedCompressionAlgorithms()}
 * <li><b>table.input.split.minSize</b> (int) Minimum split size (default to
 * 64*1024).
 * </ul>
 */
public class BasicTable {

    // Shared logger for BasicTable and its nested classes.
    static Log LOG = LogFactory.getLog(BasicTable.class);

    // name of the BasicTable schema file
    private final static String BT_SCHEMA_FILE = ".btschema";
    // schema version
    private final static Version SCHEMA_VERSION = new Version((short) 1, (short) 1);
    // name of the BasicTable meta-data file
    private final static String BT_META_FILE = ".btmeta";

    // Prefix of the marker file that dropColumnGroup() creates in the table
    // directory (prefix + column group name) to record that a column group
    // has been deleted.
    private final static String DELETED_CG_PREFIX = ".deleted-";

    // NOTE(review): presumably the separator used when the names of a table's
    // deleted column groups are joined into one string -- confirm against callers.
    public final static String DELETED_CG_SEPARATOR_PER_TABLE = ",";

    // no public ctor for instantiating a BasicTable object
    private BasicTable() {
        // no-op
    }

    /**
     * Deletes the data for the column group specified by cgName.
     * Readers that later try to read the fields that were stored in the
     * column group get null, since the underlying data is removed.
     * <br> <br>
     *
     * The effect on readers that are currently reading from the table while
     * a column group is dropped is unspecified. The suggested practice is to
     * drop column groups when there are no readers or writers for the table.
     * <br> <br>
     *
     * Column group names are usually specified in the "storage hint" while
     * creating a table. If no name is specified, the system assigns a simple
     * name. These names can be obtained through "dumpInfo()" and other methods.
     * <br> <br>
     *
     * Dropping a column group that has already been removed is a no-op; no
     * exception is thrown.
     * <br> <br>
     *
     * Note that this feature is experimental now and subject to changes in the
     * future.
     *
     * @param path path to BasicTable
     * @param conf Configuration determines file system and other parameters.
     * @param cgName name of the column group to drop.
     * @throws IOException IOException could occur for various reasons. E.g.
     *         a user does not have permissions to write to table directory,
     *         the schema file cannot be read, or no column group with the
     *         given name exists.
     */
    public static void dropColumnGroup(Path path, Configuration conf, String cgName) throws IOException {

        // Resolve the file system from the table path itself so that a table
        // living on a non-default file system is handled correctly.
        FileSystem fs = path.getFileSystem(conf);
        int triedCount = 0;
        int numCGs = SchemaFile.getNumCGs(path, conf);
        SchemaFile schemaFile = null;

        /* Retry up to numCGs times accounting for other CG deleting threads or processes.*/
        while (triedCount++ < numCGs) {
            try {
                schemaFile = new SchemaFile(path, null, conf);
                break;
            } catch (FileNotFoundException e) {
                LOG.info("Try " + triedCount + " times : " + e.getMessage());
            } catch (Exception e) {
                // Keep the original exception as the cause so the stack trace is not lost.
                throw new IOException("Cannot construct SchemaFile : " + e.getMessage(), e);
            }
        }

        if (schemaFile == null) {
            throw new IOException("Cannot construct SchemaFile");
        }

        int cgIdx = schemaFile.getCGByName(cgName);
        if (cgIdx < 0) {
            throw new IOException(path + " : Could not find a column group with the name '" + cgName + "'");
        }

        Path cgPath = new Path(path, schemaFile.getName(cgIdx));

        // The column group is already marked deleted: only finish a previously
        // interrupted deletion (if its directory is still around) and return.
        if (schemaFile.isCGDeleted(cgIdx)) {
            if (fs.exists(cgPath)) {
                LOG.info(path + " : " + " clearing unfinished deletion of column group " + cgName + ".");
                fs.delete(cgPath, true);
            }
            LOG.info(path + " : column group " + cgName + " is already deleted.");
            return;
        }

        // try to delete the column group:

        // first check if the user has enough permissions to list the directory
        fs.listStatus(cgPath);

        // verify if the user has enough permissions by trying to create
        // a temporary file in cg. The file lives inside the column group
        // directory, so it goes away with the recursive delete below.
        OutputStream out = fs.create(new Path(cgPath, ".tmp" + DELETED_CG_PREFIX + cgName), true);
        out.close();

        // First try to create a file indicating a column group is deleted.
        try {
            Path deletedCGPath = new Path(path, DELETED_CG_PREFIX + cgName);
            // create without overriding, so a concurrent drop of the same
            // column group makes this call fail instead of silently succeeding.
            out = fs.create(deletedCGPath, false);
            out.close();
        } catch (IOException e) {
            // one remote possibility is that another user
            // already deleted CG.
            SchemaFile tempSchema = new SchemaFile(path, null, conf);
            if (tempSchema.isCGDeleted(cgIdx)) {
                LOG.info(path + " : " + cgName + " is deleted by someone else. That is ok.");
                return;
            }
            // otherwise, it is some other error.
            throw e;
        }

        // At this stage, the CG is marked deleted. Now just try to
        // delete the actual directory:
        if (!fs.delete(cgPath, true)) {
            String msg = path + " : Could not detete column group " + cgName + ". It is marked deleted.";
            LOG.warn(msg);
            throw new IOException(msg);
        }

        LOG.info("Dropped " + cgName + " from " + path);
    }

    /**
     * BasicTable reader.
     */
    public static class Reader implements Closeable {
        // Directory path of the table this reader was opened on.
        private Path path;
        // True until the constructor completes successfully; set back to true by close().
        private boolean closed = true;
        // Schema file loaded from the table directory by the constructor.
        private SchemaFile schemaFile;
        // Current projection; defaults to the full logical schema and is replaced by setProjection().
        private Projection projection;
        // True once checkInferredMapping() has bound the current projection to the
        // column group readers; reset to false by setProjection().
        boolean inferredMapping;
        // Reader for the table's meta-data file (BT_META_FILE).
        private MetaFile.Reader metaReader;
        // Lazily built table status; see getStatus().
        private BasicTableStatus status;
        private int firstValidCG = -1; /// First column group that exists.
        // Cached result of getRowSplitCGIndex(); -1 while not yet determined.
        private int rowSplitCGIndex = -1;
        // Partition built from the logical schema, the current projection, the storage string and the comparator.
        Partition partition;
        // One reader per physical column group; an entry is null when that column group is deleted.
        ColumnGroup.Reader[] colGroups;
        // One tuple per column group, null where the column group is not needed or is deleted;
        // handed to the partition via setSource() in checkInferredMapping().
        Tuple[] cgTuples;

        /**
         * Binds the current projection to the column group readers if that has
         * not been done since the projection was last set. Each existing column
         * group reader receives its share of the projection, a fresh tuple is
         * created for every needed column group that still exists, and the
         * resulting tuples are installed as the partition's source.
         *
         * @throws ParseException propagated from the partition / column group calls
         * @throws IOException propagated from the partition / column group calls
         */
        private synchronized void checkInferredMapping() throws ParseException, IOException {
            if (inferredMapping) {
                // The projection is unchanged, so the mapping does not need to be recalculated.
                return;
            }
            final int cgCount = colGroups.length;
            for (int cg = 0; cg < cgCount; cg++) {
                final ColumnGroup.Reader cgReader = colGroups[cg];
                if (cgReader != null) {
                    cgReader.setProjection(partition.getProjection(cg));
                }
                if (!partition.isCGNeeded(cg)) {
                    cgTuples[cg] = null;
                    continue;
                }
                if (isCGDeleted(cg)) {
                    // The projection touches a deleted column group; warn about it.
                    LOG.warn("Trying to read from deleted column group " + schemaFile.getName(cg)
                            + ". NULL is returned for corresponding columns. " + "Table at " + path);
                } else {
                    cgTuples[cg] = TypesUtils.createTuple(cgReader.getSchema());
                }
            }
            partition.setSource(cgTuples);
            inferredMapping = true;
        }

        /**
         * Tells whether the column group at the given index is deleted, i.e.
         * whether no reader was created for it.
         *
         * @param nx index of the column group
         * @return true if there is no reader for that column group
         */
        private boolean isCGDeleted(int nx) {
            final ColumnGroup.Reader cgReader = colGroups[nx];
            return cgReader == null;
        }

        /**
         * Opens a BasicTable for reading without a caller-supplied list of
         * deleted column groups.
         *
         * @param path
         *          The directory path to the BasicTable.
         * @param conf
         *          Optional configuration parameters.
         * @throws IOException
         *           if the table cannot be opened.
         */
        public Reader(Path path, Configuration conf) throws IOException {
            this(path, (String[]) null, conf);
        }

        /**
         * Create a BasicTable reader.
         *
         * Loads the schema file and the meta-data file, sets up a default
         * projection covering the whole logical schema and opens one
         * {@link ColumnGroup.Reader} per column group that is not deleted. If
         * anything fails, every reader opened so far is closed again before the
         * exception is propagated.
         *
         * @param path
         *          The directory path to the BasicTable.
         * @param deletedCGs
         *          Passed through to the schema file; may be null. Whether it is
         *          null is also forwarded to each column group reader as a flag.
         *          NOTE(review): presumably the deleted column group names known
         *          to the caller -- confirm against SchemaFile.
         * @param conf
         *          Optional configuration parameters.
         * @throws IOException
         *           if construction fails for any reason; the original failure
         *           is attached as the cause.
         */
        public Reader(Path path, String[] deletedCGs, Configuration conf) throws IOException {
            try {
                boolean mapper = (deletedCGs != null);
                this.path = path;
                schemaFile = new SchemaFile(path, deletedCGs, conf);
                metaReader = MetaFile.createReader(new Path(path, BT_META_FILE), conf);
                // create column group readers
                int numCGs = schemaFile.getNumOfPhysicalSchemas();
                Schema schema;
                colGroups = new ColumnGroup.Reader[numCGs];
                cgTuples = new Tuple[numCGs];
                // set default projection that contains everything
                schema = schemaFile.getLogical();
                projection = new Projection(schema);
                String storage = schemaFile.getStorageString();
                String comparator = schemaFile.getComparator();
                partition = new Partition(schema, projection, storage, comparator);
                for (int nx = 0; nx < numCGs; nx++) {
                    if (!schemaFile.isCGDeleted(nx)) {
                        colGroups[nx] = new ColumnGroup.Reader(new Path(path, partition.getCGSchema(nx).getName()),
                                conf, mapper);
                        if (firstValidCG < 0) {
                            firstValidCG = nx;
                        }
                    }
                    // Only existing column groups that the projection needs get a tuple.
                    if (colGroups[nx] != null && partition.isCGNeeded(nx)) {
                        cgTuples[nx] = TypesUtils.createTuple(colGroups[nx].getSchema());
                    } else {
                        cgTuples[nx] = null;
                    }
                }
                closed = false;
            } catch (Exception e) {
                // Attach the original exception as the cause so its stack trace is preserved.
                throw new IOException("BasicTable.Reader constructor failed : " + e.getMessage(), e);
            } finally {
                if (closed) {
                    // Construction failed: release whatever was opened so far.
                    // Close failures are ignored on purpose so that they do not
                    // mask the exception that made construction fail.
                    if (colGroups != null) {
                        for (int i = 0; i < colGroups.length; ++i) {
                            if (colGroups[i] != null) {
                                try {
                                    colGroups[i].close();
                                } catch (Exception ignored) {
                                    // best-effort cleanup
                                }
                            }
                        }
                    }
                    if (metaReader != null) {
                        try {
                            metaReader.close();
                        } catch (Exception ignored) {
                            // best-effort cleanup
                        }
                    }
                }
            }
        }

        /**
         * Reports whether the table is sorted, as recorded in its schema file.
         *
         * @return true if the table is sorted.
         */
        public boolean isSorted() {
            final boolean sorted = schemaFile.isSorted();
            return sorted;
        }

        /**
         * Returns the sort information recorded in the table's schema file.
         *
         * @return the sort information (sorted columns) of the table
         */
        public SortInfo getSortInfo() {
            final SortInfo sortInfo = schemaFile.getSortInfo();
            return sortInfo;
        }

        /**
         * Looks up a column group name in the schema file.
         *
         * @param i index of the column group
         * @return the name of the i-th column group
         */
        public String getName(int i) {
            final String cgName = schemaFile.getName(i);
            return cgName;
        }

        /**
         * Set the projection for the reader. This will affect calls to
         * {@link #getScanner(RangeSplit, boolean)},
         * {@link #getScanner(BytesWritable, BytesWritable, boolean)},
         * {@link #getStatus()}, {@link #getSchema()}.
         *
         * The partition is rebuilt for the new projection and the cached
         * projection-to-column-group mapping is invalidated, so it is
         * recomputed the next time a scanner is requested.
         *
         * @param projection
         *          The projection on the BasicTable for subsequent read operations.
         *          For this version of implementation, the projection is a comma
         *          separated list of column names, such as
         *          "FirstName, LastName, Sex, Department". Pass null to select
         *          all columns.
         * @throws ParseException
         * @throws IOException
         */
        public synchronized void setProjection(String projection) throws ParseException, IOException {
            // A null projection string selects the full logical schema; otherwise the
            // (untyped) projection string is resolved against the logical schema.
            this.projection = (projection == null)
                    ? new Projection(schemaFile.getLogical())
                    : new Projection(schemaFile.getLogical(), projection);
            partition = new Partition(schemaFile.getLogical(), this.projection, schemaFile.getStorageString(),
                    schemaFile.getComparator());
            inferredMapping = false;
        }

        /**
         * Get the status of the BasicTable. The status is built on first use
         * and the same object is returned afterwards.
         *
         * @return the table status
         * @throws IOException if building the status fails
         */
        public BasicTableStatus getStatus() throws IOException {
            if (status != null) {
                return status;
            }
            buildStatus();
            return status;
        }

        /**
         * Given a split range, calculate how the file data that fall into the range
         * are distributed among hosts. Only column groups that are needed by the
         * current projection and are not deleted contribute to the result.
         *
         * @param split
         *          The range-based split. Can be null to indicate the whole TFile.
         * @return An object that conveys how blocks that fall in the split are
         *         distributed across hosts; empty if no column group exists.
         * @see #rangeSplit(int)
         */
        public BlockDistribution getBlockDistribution(RangeSplit split) throws IOException {
            final BlockDistribution distribution = new BlockDistribution();
            if (firstValidCG < 0) {
                return distribution;
            }
            final CGRangeSplit cgSplit = (split != null) ? split.getCGRangeSplit() : null;
            for (int cg = 0; cg < colGroups.length; cg++) {
                if (!partition.isCGNeeded(cg) || isCGDeleted(cg)) {
                    continue;
                }
                distribution.add(colGroups[cg].getBlockDistribution(cgSplit));
            }
            return distribution;
        }

        /**
         * Given a row-based split, calculate how the file data that fall into the split
         * are distributed among hosts. Only the column group the split lies on
         * is consulted.
         *
         * @param split The row-based split. <i>Cannot</i> be null.
         * @return An object that conveys how blocks that fall into the split are
         *         distributed across hosts.
         */
        public BlockDistribution getBlockDistribution(RowSplit split) throws IOException {
            final BlockDistribution distribution = new BlockDistribution();
            final ColumnGroup.Reader cgReader = colGroups[split.getCGIndex()];
            distribution.add(cgReader.getBlockDistribution(split.getCGRowSplit()));
            return distribution;
        }

        /**
         * Collect some key samples and use them to partition the table. Only
         * applicable to sorted BasicTable. The returned {@link KeyDistribution}
         * object also contains information on how data are distributed for each
         * key-partitioned bucket. The sampling is delegated to the column group
         * chosen by {@link #getRowSplitCGIndex()}.
         *
         * @param n
         *          Targeted size of the sampling.
         * @param nTables
         *          Number of tables in union
         * @param lastBd
         *          Passed through unchanged to the column group reader.
         * @return KeyDistribution object, or null if the table has no existing
         *         column group.
         * @throws IOException
         */
        public KeyDistribution getKeyDistribution(int n, int nTables, BlockDistribution lastBd) throws IOException {
            if (firstValidCG < 0) {
                return null;
            }
            // pick the largest CG as in the row split case
            final int cgIndex = getRowSplitCGIndex();
            return colGroups[cgIndex].getKeyDistribution(n, nTables, lastBd);
        }

        /**
         * Get a scanner that reads all rows whose row keys fall in a specific
         * range. Only applicable to sorted BasicTable.
         *
         * @param beginKey
         *          The begin key of the scan range. If null, start from the first
         *          row in the table.
         * @param endKey
         *          The end key of the scan range. If null, scan till the last row
         *          in the table.
         * @param closeReader
         *          close the underlying Reader object when we close the scanner.
         *          Should be set to true if we have only one scanner on top of the
         *          reader, so that we should release resources after the scanner is
         *          closed.
         * @return A scanner object.
         * @throws IOException
         *           if the projection mapping cannot be established (the original
         *           failure is attached as the cause) or the scanner cannot be
         *           created.
         */
        public synchronized TableScanner getScanner(BytesWritable beginKey, BytesWritable endKey,
                boolean closeReader) throws IOException {
            try {
                checkInferredMapping();
            } catch (Exception e) {
                // Attach the original exception as the cause so its stack trace is preserved.
                throw new IOException("getScanner failed : " + e.getMessage(), e);
            }
            return new BTScanner(beginKey, endKey, closeReader, partition);
        }

        /**
         * Get a scanner that reads a consecutive number of rows as defined in the
         * {@link RangeSplit} object, which should be obtained from previous calls
         * of {@link #rangeSplit(int)}.
         *
         * @param split
         *          The split range. If null, get a scanner to read the complete
         *          table.
         * @param closeReader
         *          close the underlying Reader object when we close the scanner.
         *          Should be set to true if we have only one scanner on top of the
         *          reader, so that we should release resources after the scanner is
         *          closed.
         * @return A scanner object.
         * @throws IOException
         * @throws ParseException
         */
        public synchronized TableScanner getScanner(RangeSplit split, boolean closeReader)
                throws IOException, ParseException {
            // Make sure the current projection is bound to the column groups first.
            checkInferredMapping();
            final TableScanner rangeScanner = new BTScanner(split, partition, closeReader);
            return rangeScanner;
        }

        /**
         * Get a scanner that reads a consecutive number of rows as defined in the
         * {@link RowSplit} object.
         *
         * @param closeReader
         *          close the underlying Reader object when we close the scanner.
         *          Should be set to true if we have only one scanner on top of the
         *          reader, so that we should release resources after the scanner is
         *          closed.
         * @param rowSplit split based on row numbers.
         *
         * @return A scanner object.
         * @throws IOException
         * @throws ParseException
         */
        public synchronized TableScanner getScanner(boolean closeReader, RowSplit rowSplit)
                throws IOException, ParseException {
            // Make sure the current projection is bound to the column groups first.
            checkInferredMapping();
            return new BTScanner(rowSplit, closeReader, partition);
        }

        /**
         * Get the schema of the table as seen through the current projection.
         * It may differ from
         * {@link BasicTable.Reader#getSchema(Path, Configuration)} if a projection
         * has been set on the table.
         *
         * @return The schema of the BasicTable.
         */
        public Schema getSchema() {
            final Schema projected = projection.getSchema();
            return projected;
        }

        /**
         * Get the BasicTable schema without loading the full table index.
         *
         * @param path
         *          The path to the BasicTable.
         * @param conf
         *          Configuration used to load the schema file.
         * @return The logical Schema of the table (all columns).
         * @throws IOException
         */
        public static Schema getSchema(Path path, Configuration conf) throws IOException {
            // fake an empty deleted cg list as getSchema does not care about deleted cgs
            final String[] noDeletedCGs = new String[0];
            return new SchemaFile(path, noDeletedCGs, conf).getLogical();
        }

        /**
         * Get the path to the table in string form.
         *
         * @return The path string to the table.
         */
        public String getPath() {
            final String tablePath = path.toString();
            return tablePath;
        }

        /**
         * Get the path filter used by the table.
         *
         * @param conf configuration handed to the column group path filter via
         *          its static {@code setConf}.
         * @return a new column group path filter
         */
        public PathFilter getPathFilter(Configuration conf) {
            final ColumnGroup.CGPathFilter cgFilter = new ColumnGroup.CGPathFilter();
            // setConf is static on CGPathFilter, so it is invoked on the class, not the instance.
            ColumnGroup.CGPathFilter.setConf(conf);
            return cgFilter;
        }

        /**
         * Split the table into at most n parts.
         *
         * The first non-deleted column group is used to compute the split; the
         * other column groups will be split exactly the same way. When every
         * column group has been dropped, a single dummy split is returned.
         *
         * @param n Maximum number of parts in the output list.
         * @return A list of RangeSplit objects, each of which can be used to
         *         construct TableScanner later.
         */
        public List<RangeSplit> rangeSplit(int n) throws IOException {
            if (firstValidCG < 0) {
                // all column groups are dropped: hand back one dummy split
                final List<RangeSplit> dummy = new ArrayList<RangeSplit>(1);
                dummy.add(new BasicTable.Reader.RangeSplit(new CGRangeSplit(0, 0)));
                return dummy;
            }

            final List<CGRangeSplit> cgSplits = colGroups[firstValidCG].rangeSplit(n);
            final List<RangeSplit> tableSplits = new ArrayList<RangeSplit>(cgSplits.size());
            for (CGRangeSplit cgSplit : cgSplits) {
                tableSplits.add(new BasicTable.Reader.RangeSplit(cgSplit));
            }
            return tableSplits;
        }

        /**
         * We already use FileInputFormat to create byte offset-based input splits.
         * Their information is encoded in starts, lengths and paths. This method
         * wraps this information to form RowSplit objects at basic table level.
         *
         * @param starts array of starting byte of fileSplits.
         * @param lengths array of length of fileSplits.
         * @param paths array of path of fileSplits.
         * @param splitCGIndex index of column group that is used to create fileSplits.
         * @param batchSizes passed through unchanged to the column group reader.
         * @param numBatches passed through unchanged to the column group reader.
         * @return A list of RowSplit objects, each of which can be used to
         *         construct a TableScanner later.
         */
        public List<RowSplit> rowSplit(long[] starts, long[] lengths, Path[] paths, int splitCGIndex,
                int[] batchSizes, int numBatches) throws IOException {
            final List<CGRowSplit> cgSplits = colGroups[splitCGIndex].rowSplit(starts, lengths, paths, batchSizes,
                    numBatches);
            final List<RowSplit> tableSplits = new ArrayList<RowSplit>(cgSplits.size());
            for (CGRowSplit cgSplit : cgSplits) {
                // tag every column-group-level split with the column group it belongs to
                tableSplits.add(new BasicTable.Reader.RowSplit(splitCGIndex, cgSplit));
            }
            return tableSplits;
        }

        /**
         * Rearrange the files according to the column group index ordering.
         * The work is delegated to the column group chosen by
         * {@link #getRowSplitCGIndex()}.
         *
         * @param fileStatus array of FileStatus to be rearranged
         */
        public void rearrangeFileIndices(FileStatus[] fileStatus) throws IOException {
            final int cgIndex = getRowSplitCGIndex();
            colGroups[cgIndex].rearrangeFileIndices(fileStatus);
        }

        /**
         * Get index of the column group that will be used for row-based split.
         *
         * The largest (by status size) column group that is both needed by the
         * current projection and not deleted is preferred. If there is none,
         * the first non-deleted column group is used. The result is cached once
         * a column group has been found.
         *
         * @return the column group index, or -1 if no column group exists.
         */
        public int getRowSplitCGIndex() throws IOException {
            if (rowSplitCGIndex != -1) {
                // already determined by an earlier call
                return rowSplitCGIndex;
            }

            // Try to find the largest non-deleted and used column group by projection.
            int bestIndex = -1;
            long bestSize = -1;
            for (int cg = 0; cg < colGroups.length; cg++) {
                if (partition.isCGNeeded(cg) && !isCGDeleted(cg)) {
                    final long cgSize = colGroups[cg].getStatus().getSize();
                    if (cgSize > bestSize) {
                        bestIndex = cg;
                        bestSize = cgSize;
                    }
                }
            }

            if (bestIndex >= 0) {
                // We do have a largest non-deleted and used column group; split on it.
                rowSplitCGIndex = bestIndex;
            } else if (firstValidCG >= 0) {
                // All projection columns are either deleted or non-existing:
                // fall back to the first non-deleted column group if it exists.
                rowSplitCGIndex = firstValidCG;
            }
            return rowSplitCGIndex;
        }

        /**
         * Close the BasicTable for reading. Resources are released.
         *
         * Calling this on an already closed reader is a no-op. If closing the
         * meta reader or one of the column group readers fails, a best-effort
         * second pass closes every resource (ignoring further failures) before
         * the original exception is propagated.
         *
         * @throws IOException if closing one of the underlying readers fails.
         */
        @Override
        public void close() throws IOException {
            if (closed) {
                return;
            }
            boolean allClosed = false;
            try {
                closed = true;
                metaReader.close();
                for (int i = 0; i < colGroups.length; ++i) {
                    if (colGroups[i] != null) {
                        colGroups[i].close();
                    }
                }
                allClosed = true;
            } finally {
                if (!allClosed) {
                    // Something failed above. Make sure the remaining resources are
                    // released; failures here are ignored on purpose so that they do
                    // not mask the exception already in flight.
                    try {
                        metaReader.close();
                    } catch (Exception ignored) {
                        // best-effort cleanup
                    }
                    for (int i = 0; i < colGroups.length; ++i) {
                        // Deleted column groups have no reader; skip them.
                        if (colGroups[i] == null) {
                            continue;
                        }
                        try {
                            colGroups[i].close();
                        } catch (Exception ignored) {
                            // best-effort cleanup
                        }
                    }
                }
            }
        }

        /**
         * @return the BasicTable schema string as reported by the schema file.
         */
        String getBTSchemaString() {
            final String btSchema = schemaFile.getBTSchemaString();
            return btSchema;
        }

        /**
         * @return the storage string as reported by the schema file.
         */
        String getStorageString() {
            final String storage = schemaFile.getStorageString();
            return storage;
        }

        /**
         * @return the deleted column groups of this table, as reported by its
         *         schema file.
         */
        public String getDeletedCGs() {
            final String deleted = schemaFile.getDeletedCGs();
            return deleted;
        }

        /**
         * Reads the deleted column groups of a table without opening a full
         * reader on it.
         *
         * @param path The path to the BasicTable.
         * @param conf Configuration used to load the schema file.
         * @return the deleted column groups as reported by the schema file.
         * @throws IOException if the schema file cannot be loaded.
         */
        public static String getDeletedCGs(Path path, Configuration conf) throws IOException {
            final String[] noDeletedCGs = new String[0];
            return new SchemaFile(path, noDeletedCGs, conf).getDeletedCGs();
        }

        /**
         * Builds the table status and stores it in {@code status}.
         *
         * Begin key, end key and row count are taken from the first existing
         * column group (or set to an empty key and zero rows when every column
         * group is deleted); the size is the sum over all existing column groups.
         *
         * The status is assembled in a local object and only published once it
         * is complete, so a failure midway does not leave a half-populated
         * status behind for later {@link #getStatus()} calls to return.
         *
         * @throws IOException if a column group status cannot be obtained.
         */
        private void buildStatus() throws IOException {
            final BasicTableStatus built = new BasicTableStatus();
            if (firstValidCG >= 0) {
                final BasicTableStatus firstStatus = colGroups[firstValidCG].getStatus();
                built.beginKey = firstStatus.getBeginKey();
                built.endKey = firstStatus.getEndKey();
                built.rows = firstStatus.getRows();
            } else {
                // No column group left: report an empty key range and no rows.
                built.beginKey = new BytesWritable(new byte[0]);
                built.endKey = built.beginKey;
                built.rows = 0;
            }
            built.size = 0;
            for (int nx = 0; nx < colGroups.length; nx++) {
                if (colGroups[nx] != null) {
                    built.size += colGroups[nx].getStatus().getSize();
                }
            }
            status = built;
        }

        /**
         * Obtain an input stream for reading a meta block from the table's
         * meta-data file.
         *
         * @param name
         *          The name of the meta block.
         * @return The input stream for reading the meta block.
         * @throws IOException
         * @throws MetaBlockDoesNotExist
         */
        public DataInputStream getMetaBlock(String name) throws MetaBlockDoesNotExist, IOException {
            final DataInputStream blockStream = metaReader.getMetaBlock(name);
            return blockStream;
        }

        /**
         * A range-based split on the table. It wraps a single column-group-level
         * range split; the content of the split is implementation-dependent.
         */
        public static class RangeSplit implements Writable {
            /** The wrapped column group range split; null until set or read. */
            CGRangeSplit slice;

            /**
             * Wraps the given column group range split.
             */
            RangeSplit(CGRangeSplit split) {
                this.slice = split;
            }

            /**
             * Default constructor; the split is populated later through
             * {@link #readFields(DataInput)}.
             */
            public RangeSplit() {
                // no-op
            }

            /**
             * Reads one column group range split from the input and makes it
             * the wrapped split.
             *
             * @see Writable#readFields(DataInput)
             */
            @Override
            public void readFields(DataInput in) throws IOException {
                final CGRangeSplit restored = new CGRangeSplit();
                restored.readFields(in);
                this.slice = restored;
            }

            /**
             * Writes the wrapped column group range split to the output.
             *
             * @see Writable#write(DataOutput)
             */
            @Override
            public void write(DataOutput out) throws IOException {
                this.slice.write(out);
            }

            /**
             * @return the wrapped column group range split.
             */
            CGRangeSplit getCGRangeSplit() {
                return this.slice;
            }
        }

        /**
         * A row-based split on the zebra table. It pairs the index of the column
         * group the split lies on with the CGRowSplit describing the split inside
         * that column group.
         */
        public static class RowSplit implements Writable {
            int cgIndex; // column group index where split lies on;
            // The wrapped column group row split; null until supplied through the
            // constructor or restored by readFields().
            CGRowSplit slice;

            RowSplit(int cgidx, CGRowSplit split) {
                this.cgIndex = cgidx;
                this.slice = split;
            }

            /**
             * Default constructor; the column group index is 0 and the wrapped
             * split is unset until readFields() is called.
             */
            public RowSplit() {
                // no-op
            }

            /**
             * Renders the column group index followed by the wrapped split.
             * 
             * @return a printable description of this split; an unset wrapped split
             *         is rendered as "null".
             */
            @Override
            public String toString() {
                StringBuilder sb = new StringBuilder();
                sb.append("{cgIndex = ").append(cgIndex).append("}\n");
                // String.valueOf keeps toString() usable on an instance created by
                // the default constructor, whose slice is still null.
                sb.append(String.valueOf(slice));

                return sb.toString();
            }

            /**
             * Reads the column group index and then the wrapped split.
             * 
             * @see Writable#readFields(DataInput)
             */
            @Override
            public void readFields(DataInput in) throws IOException {
                this.cgIndex = Utils.readVInt(in);
                CGRowSplit cgrs = new CGRowSplit();
                cgrs.readFields(in);
                this.slice = cgrs;
            }

            /**
             * Writes the column group index and then the wrapped split.
             * 
             * @see Writable#write(DataOutput)
             */
            @Override
            public void write(DataOutput out) throws IOException {
                Utils.writeVInt(out, cgIndex);
                slice.write(out);
            }

            int getCGIndex() {
                return cgIndex;
            }

            CGRowSplit getCGRowSplit() {
                return slice;
            }
        }

        /**
         * BasicTable scanner class
         */
        private class BTScanner implements TableScanner {
            // Projection taken from the partition in init(); backs getProjection()
            // and getSchema().
            private Projection schema;
            // One slot per column group; a slot is null when no scanner was opened
            // for that column group, or after the scanner has been closed.
            private CGScanner[] cgScanners;
            // Number of checkIntegrity() calls made so far.
            private int opCount = 0;
            // Picks the column group scanner that atEnd() and getKey() cross-check
            // against the first one.
            Random random = new Random(System.nanoTime());
            // checking for consistency once every 1000 times.
            private static final int VERIFY_FREQ = 1000;
            // Set to true by close(); further close() calls return immediately.
            private boolean sClosed = false;
            // When true, close() also closes the enclosing BasicTable.Reader.
            private boolean closeReader;
            // Partition used to decide which column groups are needed and to
            // assemble the row in getValue().
            private Partition partition;

            // Counts one more operation and reports whether this call should run
            // the cross column group consistency check: every VERIFY_FREQ-th call,
            // and only when there is more than one scanner slot.
            private synchronized boolean checkIntegrity() {
                opCount++;
                final boolean dueForCheck = (opCount % VERIFY_FREQ) == 0;
                return dueForCheck && (cgScanners.length > 1);
            }

            public BTScanner(BytesWritable beginKey, BytesWritable endKey, boolean closeReader, Partition partition)
                    throws IOException {
                init(null, null, beginKey, endKey, closeReader, partition);
            }

            public BTScanner(RangeSplit split, Partition partition, boolean closeReader) throws IOException {
                init(null, split, null, null, closeReader, partition);
            }

            public BTScanner(RowSplit rowSplit, boolean closeReader, Partition partition) throws IOException {
                init(rowSplit, null, null, null, closeReader, partition);
            }

            /**
             * Builds the CGRowSplit to scan for a table-level row split. A fresh
             * CGRowSplit is created and populated, from the split carried by
             * rowSplit, by the fillRowSplit() method of the column group the split
             * refers to. As noted by the original author: if the startRow in
             * rowSplit is not set (i.e. < 0), the startRow and numRows are set
             * based on 'startByte' and 'numBytes' from the given rowSplit.
             * 
             * @throws IOException
             *           if the column group the split refers to is deleted.
             */
            private CGRowSplit makeCGRowSplit(RowSplit rowSplit) throws IOException {
                final CGRowSplit requested = rowSplit.getCGRowSplit();
                final int groupIndex = rowSplit.getCGIndex();
                final CGRowSplit resolved = new CGRowSplit();

                // a split cannot be resolved against a deleted column group
                if (isCGDeleted(groupIndex)) {
                    throw new IOException("CG " + groupIndex + " is deleted.");
                }

                // let the column group fill in the row numbers
                colGroups[groupIndex].fillRowSplit(resolved, requested);
                return resolved;
            }

            // Opens the scanner for one column group. A row split takes precedence
            // when given; otherwise a key range is used when at least one key is
            // given; otherwise the (possibly null) range split is used.
            private CGScanner createCGScanner(int cgIndex, CGRowSplit cgRowSplit, RangeSplit rangeSplit,
                    BytesWritable beginKey, BytesWritable endKey)
                    throws IOException, ParseException {
                final boolean byRows = cgRowSplit != null;
                final boolean byKeys = beginKey != null || endKey != null;

                if (byRows) {
                    return colGroups[cgIndex].getScanner(false, cgRowSplit);
                } else if (byKeys) {
                    return colGroups[cgIndex].getScanner(beginKey, endKey, false);
                } else {
                    CGRangeSplit cgRange = null;
                    if (rangeSplit != null) {
                        cgRange = rangeSplit.getCGRangeSplit();
                    }
                    return colGroups[cgIndex].getScanner(cgRange, false);
                }
            }

            /**
             * Opens one column group scanner for every column group that is not
             * deleted and is needed by the partition. <br>
             * If rowSplit is not null, scanners are created based on the row
             * split. <br>
             * Otherwise, if either key is not null, scanners are based on the key
             * range, <br>
             * otherwise they are based on the range split (which may be null).
             * <p>
             * When no column group is needed by the projection, a scanner is still
             * opened on the first valid column group, if there is one, so that rows
             * can be counted.
             * <p>
             * If anything fails, every scanner opened so far is closed before the
             * failure is reported.
             * 
             * @throws IOException
             *           if the row split refers to a deleted column group, or if
             *           creating any scanner fails; in the latter case the original
             *           failure is attached as the cause.
             */
            private void init(RowSplit rowSplit, RangeSplit rangeSplit, BytesWritable beginKey,
                    BytesWritable endKey, boolean closeReader, Partition partition) throws IOException {
                this.partition = partition;

                CGRowSplit cgRowSplit = null;
                if (rowSplit != null) {
                    cgRowSplit = makeCGRowSplit(rowSplit);
                }

                // Tracks success locally: sClosed starts out false, so it cannot be
                // used to tell a failed initialization from a successful one.
                boolean initialized = false;
                try {
                    schema = partition.getProjection();
                    cgScanners = new CGScanner[colGroups.length];
                    boolean anyScanner = false;
                    for (int i = 0; i < colGroups.length; ++i) {
                        if (!isCGDeleted(i) && partition.isCGNeeded(i)) {
                            anyScanner = true;
                            cgScanners[i] = createCGScanner(i, cgRowSplit, rangeSplit, beginKey, endKey);
                        }
                    }
                    if (!anyScanner && firstValidCG >= 0) {
                        // if no CG is needed explicitly by projection but the "countRow" still needs to access some column group
                        cgScanners[firstValidCG] = createCGScanner(firstValidCG, cgRowSplit, rangeSplit, beginKey,
                                endKey);
                    }
                    this.closeReader = closeReader;
                    sClosed = false;
                    initialized = true;
                } catch (Exception e) {
                    // keep the original failure as the cause so its stack trace survives
                    throw new IOException("BTScanner constructor failed : " + e.getMessage(), e);
                } finally {
                    if (!initialized && cgScanners != null) {
                        // release whatever was opened before the failure
                        for (int i = 0; i < cgScanners.length; ++i) {
                            if (cgScanners[i] != null) {
                                try {
                                    cgScanners[i].close();
                                } catch (Exception ignored) {
                                    // best effort: the initialization failure is what gets reported
                                }
                                cgScanners[i] = null;
                            }
                        }
                    }
                }
            }

            /**
             * Advances every open column group scanner by one step and returns the
             * result reported by the first of them (false when none is open).
             * 
             * @throws IOException
             *           if a scanner reports a different result than the first one.
             */
            @Override
            public boolean advance() throws IOException {
                boolean reference = false;
                boolean haveReference = false;
                for (CGScanner scanner : cgScanners) {
                    if (scanner == null) {
                        continue;
                    }
                    final boolean moved = scanner.advanceCG();
                    if (!haveReference) {
                        // the first open scanner defines the expected outcome
                        reference = moved;
                        haveReference = true;
                    } else if (moved != reference) {
                        throw new IOException("advance() failed: Column Groups are not evenly positioned.");
                    }
                }
                return reference;
            }

            /**
             * Reports whether the first open column group scanner is at its end;
             * returns true when no scanner is open. Whenever checkIntegrity() asks
             * for it, the answer is cross-checked against a randomly chosen open
             * scanner.
             * 
             * @throws IOException
             *           if the cross-checked scanner disagrees with the first one.
             */
            @Override
            public boolean atEnd() throws IOException {
                CGScanner primary = null;
                for (CGScanner candidate : cgScanners) {
                    if (candidate != null) {
                        primary = candidate;
                        break;
                    }
                }
                if (primary == null) {
                    return true;
                }

                final boolean ended = primary.atEnd();
                if (!checkIntegrity()) {
                    return ended;
                }

                // draw random slots until an open scanner is hit; at least one exists
                CGScanner witness;
                do {
                    witness = cgScanners[random.nextInt(cgScanners.length)];
                } while (witness == null);

                if (witness.atEnd() != ended) {
                    throw new IOException("atEnd() failed: Column Groups are not evenly positioned.");
                }
                return ended;
            }

            /**
             * Copies the current key of the first open column group scanner into
             * the given key; does nothing when no scanner is open. Whenever
             * checkIntegrity() asks for it, the key is cross-checked against the
             * key of a randomly chosen open scanner.
             * 
             * @throws IOException
             *           if the cross-checked scanner reports a different key.
             */
            @Override
            public void getKey(BytesWritable key) throws IOException {
                CGScanner primary = null;
                for (CGScanner candidate : cgScanners) {
                    if (candidate != null) {
                        primary = candidate;
                        break;
                    }
                }
                if (primary == null) {
                    return;
                }

                primary.getCGKey(key);
                if (!checkIntegrity()) {
                    return;
                }

                // draw random slots until an open scanner is hit; at least one exists
                CGScanner witness;
                do {
                    witness = cgScanners[random.nextInt(cgScanners.length)];
                } while (witness == null);

                final BytesWritable witnessKey = new BytesWritable();
                witness.getCGKey(witnessKey);
                if (!key.equals(witnessKey)) {
                    throw new IOException("getKey() failed: Column Groups are not evenly positioned.");
                }
            }

            /**
             * Reads the current row into the given tuple. The current value of
             * every open scanner whose column group is needed by the partition is
             * first loaded into that column group's tuple; the partition then
             * assembles the row from those tuples.
             * 
             * @param row
             *          the tuple to fill; it must have at least as many fields as
             *          the projection schema has columns.
             * @throws IOException
             *           if the tuple is too small, if a scanner fails, or if the
             *           partition fails to assemble the row (the original failure
             *           is attached as the cause).
             */
            @Override
            public void getValue(Tuple row) throws IOException {
                if (row.size() < projection.getSchema().getNumColumns()) {
                    throw new IOException("Mismatched tuple object");
                }

                for (int i = 0; i < cgScanners.length; ++i) {
                    if (cgScanners[i] == null || !partition.isCGNeeded(i)) {
                        continue;
                    }
                    if (cgTuples[i] == null) {
                        throw new AssertionError("cgTuples[" + i + "] is null");
                    }
                    cgScanners[i].getCGValue(cgTuples[i]);
                }

                try {
                    partition.read(row);
                } catch (Exception e) {
                    // keep the original failure as the cause so its stack trace survives
                    throw new IOException("getValue() failed: " + e.getMessage(), e);
                }
            }

            /**
             * Seeks every open column group scanner to the given key and returns
             * the result reported by the first of them (false when none is open).
             * 
             * @throws IOException
             *           if a scanner reports a different result than the first one.
             */
            @Override
            public boolean seekTo(BytesWritable key) throws IOException {
                boolean reference = false;
                boolean haveReference = false;
                for (CGScanner scanner : cgScanners) {
                    if (scanner != null) {
                        final boolean found = scanner.seekTo(key);
                        if (!haveReference) {
                            // the first open scanner defines the expected outcome
                            reference = found;
                            haveReference = true;
                        } else if (found != reference) {
                            throw new IOException("seekTo() failed: Column Groups are not evenly positioned.");
                        }
                    }
                }
                return reference;
            }

            /**
             * Moves every open column group scanner to its end.
             */
            @Override
            public void seekToEnd() throws IOException {
                for (CGScanner scanner : cgScanners) {
                    if (scanner != null) {
                        scanner.seekToEnd();
                    }
                }
            }

            /**
             * @return the string form of the projection this scanner was created
             *         with.
             */
            @Override
            public String getProjection() {
                final String rendered = schema.toString();
                return rendered;
            }

            /**
             * @return the schema of the projection this scanner was created with.
             */
            @Override
            public Schema getSchema() {
                return this.schema.getSchema();
            }

            /**
             * Closes every open column group scanner and, when this scanner was
             * created with closeReader set, the enclosing reader as well. Calling
             * close() again is a no-op. If the first pass fails part-way, a second,
             * best-effort pass still tries to close whatever is left before the
             * failure propagates.
             */
            @Override
            public void close() throws IOException {
                if (sClosed) {
                    return;
                }
                sClosed = true;
                try {
                    int slot = 0;
                    while (slot < cgScanners.length) {
                        if (cgScanners[slot] != null) {
                            cgScanners[slot].close();
                            cgScanners[slot] = null;
                        }
                        slot++;
                    }
                    if (closeReader) {
                        BasicTable.Reader.this.close();
                    }
                } finally {
                    // second pass: anything still non-null was not closed cleanly above
                    int slot = 0;
                    while (slot < cgScanners.length) {
                        if (cgScanners[slot] != null) {
                            try {
                                cgScanners[slot].close();
                                cgScanners[slot] = null;
                            } catch (Exception e) {
                                // no-op
                            }
                        }
                        slot++;
                    }
                    if (closeReader) {
                        try {
                            BasicTable.Reader.this.close();
                        } catch (Exception e) {
                            // no-op
                        }
                    }
                }
            }
        }
    }

    /**
     * BasicTable writer.
     */
    public static class Writer implements Closeable {
        // Schema description of the table, created by the constructors.
        private SchemaFile schemaFile;
        // Writer for the table's meta file (BT_META_FILE under the table path).
        private MetaFile.Writer metaWriter;
        // Starts out true; a constructor sets it to false once it has completed,
        // and close() sets it back to true.
        private boolean closed = true;
        // One column group writer per physical schema of the schema file.
        ColumnGroup.Writer[] colGroups;
        // Partition obtained from the schema file; the inserter uses it to split
        // an inserted row into the column group tuples.
        Partition partition;
        // Whether the table is sorted, as reported by the schema file.
        boolean sorted;
        // Set to true by finish(); further finish() calls return immediately.
        private boolean finished;
        // One tuple per column group, registered as the partition's source.
        Tuple[] cgTuples;
        // Table path and configuration as given to the constructor; used by
        // cleanupTempDir().
        private Path actualOutputPath;
        private Configuration writerConf;

        /**
         * Create a BasicTable writer. The semantics are as follows:
         * <ol>
         * <li>If path does not exist:
         * <ul>
         * <li>create the path directory, and initialize the directory for future
         * row insertion.
         * </ul>
         * <li>If path exists and the directory is empty: initialize the directory
         * for future row insertion.
         * <li>If path exists and contains what look like a complete BasicTable,
         * IOException will be thrown.
         * </ol>
         * This constructor never removes a valid/complete BasicTable.
         * <p>
         * If construction fails, the meta writer and any column group writers
         * created so far are closed before the failure is reported.
         * 
         * @param path
         *          The path to the Basic Table, either not existent or must be a
         *          directory.
         * @param btSchemaString
         *          The schema of the Basic Table. For this version of
         *          implementation, the schema of a table is a comma or
         *          semicolon-separated list of column names, such as
         *          "FirstName, LastName; Sex, Department".
         * @param btStorageString
         *          The storage specification of the Basic Table; it is handed to
         *          the schema file unchanged.
         * @param sortColumns
         *          String of comma-separated sorted columns: null for unsorted tables
         * @param comparator
         *          Name of the comparator used in sorted tables
         * @param conf
         *          Optional Configuration objects.
         * 
         * @throws IOException
         *           if any step of the construction fails; the original failure is
         *           attached as the cause.
         * @see Schema
         */
        public Writer(Path path, String btSchemaString, String btStorageString, String sortColumns,
                String comparator, Configuration conf) throws IOException {
            try {
                actualOutputPath = path;
                writerConf = conf;
                schemaFile = new SchemaFile(path, btSchemaString, btStorageString, sortColumns, comparator, conf);
                partition = schemaFile.getPartition();
                int numCGs = schemaFile.getNumOfPhysicalSchemas();
                colGroups = new ColumnGroup.Writer[numCGs];
                cgTuples = new Tuple[numCGs];
                sorted = schemaFile.isSorted();
                for (int nx = 0; nx < numCGs; nx++) {
                    colGroups[nx] = new ColumnGroup.Writer(new Path(path, schemaFile.getName(nx)),
                            schemaFile.getPhysicalSchema(nx), sorted, comparator, schemaFile.getName(nx),
                            schemaFile.getSerializer(nx), schemaFile.getCompressor(nx), schemaFile.getOwner(nx),
                            schemaFile.getGroup(nx), schemaFile.getPerm(nx), false, conf);
                    cgTuples[nx] = TypesUtils.createTuple(colGroups[nx].getSchema());
                }
                metaWriter = MetaFile.createWriter(new Path(path, BT_META_FILE), conf);
                partition.setSource(cgTuples);
                closed = false;
            } catch (Exception e) {
                // keep the original failure as the cause so its stack trace survives
                throw new IOException("BasicTable.Writer constructor failed : " + e.getMessage(), e);
            } finally {
                // closed is still true only when construction did not complete
                if (closed) {
                    if (metaWriter != null) {
                        try {
                            metaWriter.close();
                        } catch (Exception ignored) {
                            // best effort: the construction failure is what gets reported
                        }
                    }
                    if (colGroups != null) {
                        for (int i = 0; i < colGroups.length; ++i) {
                            if (colGroups[i] != null) {
                                try {
                                    colGroups[i].close();
                                } catch (Exception ignored) {
                                    // best effort: the construction failure is what gets reported
                                }
                            }
                        }
                    }
                }
            }
        }

        /**
         * A wrapper to support the backward compatible constructor: it delegates
         * to the full constructor, passing null for both the sort columns and the
         * comparator.
         * 
         * @param path
         *          The path to the Basic Table.
         * @param btSchemaString
         *          The schema of the Basic Table.
         * @param btStorageString
         *          The storage specification of the Basic Table.
         * @param conf
         *          Optional Configuration objects.
         * @throws IOException
         *           propagated from the full constructor.
         */
        public Writer(Path path, String btSchemaString, String btStorageString, Configuration conf)
                throws IOException {
            this(path, btSchemaString, btStorageString, null, null, conf);
        }

        /**
         * Reopen an already created BasicTable for writing. Exception will be
         * thrown if the table is already closed, or is in the process of being
         * closed.
         * <p>
         * The schema is taken from the configuration when an output schema is set
         * there, and read from the table path otherwise. Column group writers are
         * given a work path below "_temporary" under the table path.
         * <p>
         * If construction fails, the meta writer and any column group writers
         * created so far are closed before the failure is reported.
         * 
         * @param path
         *          The path to the Basic Table.
         * @param conf
         *          The configuration object.
         * @throws IOException
         *           if any step of the construction fails; the original failure is
         *           attached as the cause.
         */
        public Writer(Path path, Configuration conf) throws IOException {
            try {
                actualOutputPath = path;
                writerConf = conf;

                if (ZebraConf.getOutputSchema(conf) != null) {
                    schemaFile = new SchemaFile(conf); // Read out schemaFile from conf, instead of from hdfs;
                } else { // This is only for io test cases and it cannot happen for m/r and pig cases; 
                    schemaFile = new SchemaFile(path, new String[0], conf); // fake an empty deleted cg list as no cg should have been deleted now
                }
                int numCGs = schemaFile.getNumOfPhysicalSchemas();
                partition = schemaFile.getPartition();
                sorted = schemaFile.isSorted();
                colGroups = new ColumnGroup.Writer[numCGs];
                cgTuples = new Tuple[numCGs];
                Path tmpWorkPath = new Path(path, "_temporary");
                for (int nx = 0; nx < numCGs; nx++) {
                    CGSchema cgschema = new CGSchema(schemaFile.getPhysicalSchema(nx), sorted,
                            schemaFile.getComparator(), schemaFile.getName(nx), schemaFile.getSerializer(nx),
                            schemaFile.getCompressor(nx), schemaFile.getOwner(nx), schemaFile.getGroup(nx),
                            schemaFile.getPerm(nx));

                    colGroups[nx] = new ColumnGroup.Writer(new Path(path, partition.getCGSchema(nx).getName()),
                            new Path(tmpWorkPath, partition.getCGSchema(nx).getName()), cgschema, conf);

                    cgTuples[nx] = TypesUtils.createTuple(colGroups[nx].getSchema());
                }
                partition.setSource(cgTuples);
                metaWriter = MetaFile.createWriter(new Path(path, BT_META_FILE), conf);
                closed = false;
            } catch (Exception e) {
                // keep the original failure as the cause so its stack trace survives
                throw new IOException("BasicTable.Writer constructor failed : " + e.getMessage(), e);
            } finally {
                // closed is still true only when construction did not complete
                if (closed) {
                    if (metaWriter != null) {
                        try {
                            metaWriter.close();
                        } catch (Exception ignored) {
                            // best effort: the construction failure is what gets reported
                        }
                    }
                    if (colGroups != null) {
                        for (int i = 0; i < colGroups.length; ++i) {
                            if (colGroups[i] != null) {
                                try {
                                    colGroups[i].close();
                                } catch (Exception ignored) {
                                    // best effort: the construction failure is what gets reported
                                }
                            }
                        }
                    }
                }
            }
        }

        /**
         * Release resources used by the object. Unlike close(), finish() does not
         * make the table immutable. Only the first call does any work. Every
         * column group writer and the meta writer are finished; a second,
         * best-effort pass then finishes them again with failures ignored, so
         * that a failure in the first pass does not leave the rest unfinished.
         */
        public void finish() throws IOException {
            if (finished) {
                return;
            }
            finished = true;
            try {
                for (ColumnGroup.Writer cgWriter : colGroups) {
                    if (cgWriter != null) {
                        cgWriter.finish();
                    }
                }
                metaWriter.finish();
            } finally {
                try {
                    metaWriter.finish();
                } catch (Exception e) {
                    // no-op
                }
                for (ColumnGroup.Writer cgWriter : colGroups) {
                    try {
                        cgWriter.finish();
                    } catch (Exception e) {
                        // no-op
                    }
                }
            }
        }

        /**
         * Close the BasicTable for writing. No more inserters can be obtained after
         * close().
         * <p>
         * The temporary directory is removed on every call. On the first call the
         * writer is finished if it was not already, every column group writer is
         * closed, and the index of each column group is compared with the index of
         * the first one: the number of files, the file names and the row counts
         * must all match. Finally the meta writer is closed. A best-effort second
         * pass closes the meta writer and the column group writers again with
         * failures ignored.
         * 
         * @throws IOException
         *           if closing fails or if the column groups are inconsistent.
         */
        @Override
        public void close() throws IOException {
            cleanupTempDir();
            if (closed)
                return;
            closed = true;
            if (!finished)
                finish();
            try {
                ColumnGroup.CGIndex firstCGIndex = null;
                int first = -1;
                for (int nx = 0; nx < colGroups.length; nx++) {
                    if (colGroups[nx] == null) {
                        continue;
                    }
                    colGroups[nx].close();
                    if (first == -1) {
                        // the first closed column group is the reference for all others
                        first = nx;
                        firstCGIndex = colGroups[nx].index;
                        continue;
                    }
                    ColumnGroup.CGIndex cgIndex = colGroups[nx].index;
                    if (cgIndex.size() != firstCGIndex.size())
                        throw new IOException("Column Group " + colGroups[nx].path.getName()
                                + " has different number of files than in column group "
                                + colGroups[first].path.getName());
                    int size = firstCGIndex.size();
                    for (int i = 0; i < size; i++) {
                        if (!cgIndex.get(i).name.equals(firstCGIndex.get(i).name))
                            throw new IOException("File[" + i + "] in Column Group "
                                    + colGroups[nx].path.getName() + " has a different name: "
                                    + cgIndex.get(i).name + " than " + firstCGIndex.get(i).name
                                    + " in column group " + colGroups[first].path.getName());
                        if (cgIndex.get(i).rows != firstCGIndex.get(i).rows)
                            throw new IOException("File " + cgIndex.get(i).name + " in Column Group "
                                    + colGroups[nx].path.getName() + " has a different number of rows, "
                                    + cgIndex.get(i).rows + ", than " + firstCGIndex.get(i).rows
                                    + " in column group " + colGroups[first].path.getName());
                    }
                }
                metaWriter.close();
            } finally {
                try {
                    metaWriter.close();
                } catch (Exception ignored) {
                    // best effort: a failure from the first pass is what gets reported
                }
                for (int i = 0; i < colGroups.length; ++i) {
                    try {
                        colGroups[i].close();
                    } catch (Exception ignored) {
                        // best effort: a failure from the first pass is what gets reported
                    }
                }
            }
        }

        /**
         * Removes the temporary directory $path/_temporary, which is used to
         * create intermediate data during record writing. A failed delete is
         * logged as an error and is not reported to the caller.
         */
        private void cleanupTempDir() throws IOException {
            final Path tempDir = new Path(actualOutputPath, "_temporary");
            final FileSystem fs = actualOutputPath.getFileSystem(writerConf);
            if (!fs.exists(tempDir)) {
                return;
            }
            final boolean removed = fs.delete(tempDir, true);
            if (!removed) {
                LOG.error("Failed to delete the temporary output" + " directory: " + tempDir.toString());
            }
        }

        /**
         * Get the schema of the table.
         * 
         * @return the logical Schema object held by the schema file.
         */
        public Schema getSchema() {
            final Schema logical = schemaFile.getLogical();
            return logical;
        }

        /**
         * @return true if the table is sorted, as reported by the schema file
         *         when this writer was created.
         */
        public boolean isSorted() {
            return this.sorted;
        }

        /**
         * Get the sort information of the table.
         * 
         * @return the SortInfo object held by the schema file.
         */
        public SortInfo getSortInfo() {
            final SortInfo info = schemaFile.getSortInfo();
            return info;
        }

        /**
         * Get an inserter with a given name, with type checking enabled.
         * 
         * @param name
         *          the name of the inserter. If multiple calls to getInserter with
         *          the same name has been called, we expect they are the result of
         *          speculative execution and at most one of them will succeed.
         * @param finishWriter
         *          finish the underlying Writer object upon the close of the
         *          Inserter. Should be set to true if there is only one inserter
         *          operate on the table, so we should call finish() after the
         *          Inserter is closed.
         * 
         * @return An inserter object.
         * @throws IOException
         */
        public TableInserter getInserter(String name, boolean finishWriter) throws IOException {
            final boolean checkType = true;
            return getInserter(name, finishWriter, checkType);
        }

        /**
         * Get an inserter with a given name.
         * 
         * @param name
         *          the name of the inserter. If multiple calls to getInserter with
         *          the same name has been called, we expect they are the result of
         *          speculative execution and at most one of them will succeed.
         * @param finishWriter
         *          finish the underlying Writer object upon the close of the
         *          Inserter. Should be set to true if there is only one inserter
         *          operate on the table, so we should call finish() after the
         *          Inserter is closed.
         * @param checkType
         *          whether or not do type check.
         * 
         * @return An inserter object.
         * @throws IOException
         *           if the table has been closed, or if creating the inserter
         *           fails.
         */
        public TableInserter getInserter(String name, boolean finishWriter, boolean checkType) throws IOException {
            if (!closed) {
                return new BTInserter(name, finishWriter, partition, checkType);
            }
            throw new IOException("BasicTable closed");
        }

        /**
         * Obtain an output stream for creating a Meta Block with the specific name.
         * This method can only be called after we insert all rows into the table.
         * All Meta Blocks must be created by a single process prior to closing the
         * table. No more inserter can be created after this call.
         * 
         * @param name
         *          The name of the Meta Block
         * @return The output stream returned by the meta writer. Close the stream
         *         to conclude the writing.
         * @throws IOException
         *           propagated from the meta writer.
         * @throws MetaBlockAlreadyExists
         *           propagated from the meta writer.
         */
        public DataOutputStream createMetaBlock(String name) throws MetaBlockAlreadyExists, IOException {
            final DataOutputStream blockStream = metaWriter.createMetaBlock(name);
            return blockStream;
        }

        private class BTInserter implements TableInserter {
            private TableInserter cgInserters[];
            private boolean sClosed = true;
            private boolean finishWriter;
            private Partition partition = null;

            BTInserter(String name, boolean finishWriter, Partition partition) throws IOException {
                this(name, finishWriter, partition, true);
            }

            BTInserter(String name, boolean finishWriter, Partition partition, boolean checkType)
                    throws IOException {
                try {
                    cgInserters = new ColumnGroup.Writer.CGInserter[colGroups.length];
                    for (int nx = 0; nx < colGroups.length; nx++) {
                        cgInserters[nx] = colGroups[nx].getInserter(name, false, checkType);
                    }
                    this.finishWriter = finishWriter;
                    this.partition = partition;
                    sClosed = false;
                } catch (Exception e) {
                    throw new IOException("BTInsert constructor failed :" + e.getMessage());
                } finally {
                    if (sClosed) {
                        if (cgInserters != null) {
                            for (int i = 0; i < cgInserters.length; ++i) {
                                if (cgInserters[i] != null) {
                                    try {
                                        cgInserters[i].close();
                                    } catch (Exception e) {
                                        // no-op
                                    }
                                }
                            }
                        }
                    }
                }
            }

            @Override
            public Schema getSchema() {
                return Writer.this.getSchema();
            }

            /**
             * Inserts one row into the table by handing it to the partition and then
             * forwarding each per-column-group sub-tuple to the matching column group
             * inserter.
             *
             * @param key
             *          the row key, passed unchanged to every column group inserter.
             * @param row
             *          the full input row.
             * @throws IOException
             *           if this inserter is already closed, if the partition rejects
             *           the row (the original failure is kept as the cause), or if a
             *           column group inserter fails.
             */
            @Override
            public void insert(BytesWritable key, Tuple row) throws IOException {
                if (sClosed) {
                    throw new IOException("Inserter already closed");
                }

                // break the input row into sub-tuples, then insert them into the
                // corresponding CGs
                try {
                    partition.insert(key, row);
                } catch (Exception e) {
                    // Keep the original exception as the cause so the stack trace of the
                    // real failure is not lost.
                    throw new IOException("insert failed : " + e.getMessage(), e);
                }
                for (int nx = 0; nx < colGroups.length; nx++) {
                    cgInserters[nx].insert(key, cgTuples[nx]);
                }
            }

            /**
             * Closes every column group inserter and, when {@code finishWriter} is set,
             * finishes the enclosing writer. Calling it again after the first call is a
             * no-op.
             *
             * The first pass lets failures propagate to the caller. The {@code finally}
             * pass then repeats the same close/finish calls with all exceptions
             * suppressed, so inserters after a failing one still get a close attempt.
             *
             * @throws IOException
             *           if closing an inserter or finishing the writer fails in the
             *           first pass.
             */
            @Override
            public void close() throws IOException {
                if (sClosed) {
                    return;
                }
                sClosed = true;
                try {
                    for (int i = 0; i < cgInserters.length; i++) {
                        cgInserters[i].close();
                    }
                    if (finishWriter) {
                        BasicTable.Writer.this.finish();
                    }
                } finally {
                    // Best-effort second pass: every call is repeated, errors are dropped.
                    for (int i = 0; i < cgInserters.length; i++) {
                        try {
                            cgInserters[i].close();
                        } catch (Exception ignored) {
                            // no-op
                        }
                    }
                    if (finishWriter) {
                        try {
                            BasicTable.Writer.this.finish();
                        } catch (Exception ignored) {
                            // no-op
                        }
                    }
                }
            }
        }
    }

    /**
     * Drops a Basic Table by recursively deleting the given path on the file
     * system that owns it. The boolean result of the delete call is not
     * inspected.
     * 
     * @param path
     *          the path to the Basic Table.
     * @param conf
     *          the configuration object used to resolve the file system.
     * @throws IOException
     *           if resolving the file system or deleting the path fails.
     */
    public static void drop(Path path, Configuration conf) throws IOException {
        path.getFileSystem(conf).delete(path, true);
    }

    /**
     * In-memory form of a BasicTable schema file: the logical schema, the storage
     * hint string, the per-column-group physical schemas, the sort information
     * and the per-column-group "deleted" flags.
     *
     * An instance is built in one of three ways: by reading the schema file under
     * a table path, by deriving everything from a job configuration, or by
     * building from explicit strings and immediately writing the schema file.
     */
    static class SchemaFile {
        private Version version;
        String comparator;
        Schema logical;
        Schema[] physical;
        Partition partition;
        boolean sorted;
        SortInfo sortInfo = null;
        String storage;
        CGSchema[] cgschemas;

        // Array indicating if a physical schema is already dropped
        // It is probably better to create "CGProperties" class and
        // store multiple properties like name there.
        // NOTE(review): the Configuration-only constructor leaves this null, so
        // isCGDeleted()/getDeletedCGs() would fail on such an instance.
        boolean[] cgDeletedFlags;

        /**
         * Constructor for reading: loads the schema file found under {@code path}.
         *
         * @param path
         *          the table directory that contains the schema file.
         * @param deletedCGs
         *          names of column groups to flag as deleted; when {@code null} the
         *          flags are derived from marker files in the table directory.
         * @param conf
         *          the configuration used to resolve the file system.
         * @throws IOException
         *           if the schema file is missing, has an incompatible version, or
         *           cannot be read or parsed.
         */
        public SchemaFile(Path path, String[] deletedCGs, Configuration conf) throws IOException {
            readSchemaFile(path, deletedCGs, conf);
        }

        /**
         * Constructor for reading from a job configuration object; no table path is
         * needed because all information is held in the configuration.
         *
         * @param conf
         *          the job configuration holding the output schema, storage hint,
         *          sort columns and comparator.
         * @throws IOException
         *           if the schema or the partition cannot be built.
         */
        public SchemaFile(Configuration conf) throws IOException {
            String logicalStr = ZebraConf.getOutputSchema(conf);
            storage = ZebraConf.getOutputStorageHint(conf);

            // Missing sort columns / comparator are treated as empty strings.
            String confSortColumns = ZebraConf.getOutputSortColumns(conf);
            String sortColumns = confSortColumns != null ? confSortColumns : "";
            String confComparator = ZebraConf.getOutputComparator(conf);
            comparator = confComparator != null ? confComparator : "";

            version = SCHEMA_VERSION;

            try {
                logical = new Schema(logicalStr);
            } catch (Exception e) {
                throw new IOException("Schema build failed :" + e.getMessage(), e);
            }

            try {
                partition = new Partition(logicalStr, storage, comparator, sortColumns);
            } catch (Exception e) {
                throw new IOException("Partition constructor failed :" + e.getMessage(), e);
            }

            cgschemas = partition.getCGSchemas();
            physical = new Schema[cgschemas.length];
            //cgDeletedFlags = new boolean[physical.length];

            for (int nx = 0; nx < cgschemas.length; nx++) {
                physical[nx] = cgschemas[nx].getSchema();
            }

            this.sortInfo = partition.getSortInfo();
            this.sorted = partition.isSorted();
            this.comparator = (this.sortInfo == null ? null : this.sortInfo.getComparator());
            if (this.comparator == null) {
                this.comparator = "";
            }

            // NOTE(review): String.split never returns an empty array for a non-null
            // input (""" splits to one empty element), so this branch is always taken
            // and sortInfo is always re-derived from the configured sort columns.
            String[] sortColumnStr = sortColumns.split(",");
            if (sortColumnStr.length > 0) {
                sortInfo = SortInfo.parse(SortInfo.toSortString(sortColumnStr), logical, comparator);
            }
        }

        /**
         * @return the array of physical (per-column-group) schemas; the internal
         *         array itself is returned, not a copy.
         */
        public Schema[] getPhysicalSchema() {
            return physical;
        }

        /**
         * Constructor for writing: builds the partition from the given strings and
         * writes the schema file under {@code path}.
         *
         * @param path
         *          the table directory in which the schema file is created.
         * @param btSchemaStr
         *          the table schema string.
         * @param btStorageStr
         *          the storage hint string.
         * @param sortColumns
         *          the sort column specification passed to the partition.
         * @param btComparator
         *          the comparator name passed to the partition.
         * @param conf
         *          the configuration used to resolve the file system.
         * @throws IOException
         *           if the partition cannot be built or the schema file cannot be
         *           written.
         */
        public SchemaFile(Path path, String btSchemaStr, String btStorageStr, String sortColumns,
                String btComparator, Configuration conf) throws IOException {
            storage = btStorageStr;
            try {
                partition = new Partition(btSchemaStr, btStorageStr, btComparator, sortColumns);
            } catch (Exception e) {
                throw new IOException("Partition constructor failed :" + e.getMessage(), e);
            }
            this.sortInfo = partition.getSortInfo();
            this.sorted = partition.isSorted();
            this.comparator = (this.sortInfo == null ? null : this.sortInfo.getComparator());
            if (this.comparator == null) {
                this.comparator = "";
            }
            logical = partition.getSchema();
            cgschemas = partition.getCGSchemas();
            physical = new Schema[cgschemas.length];
            for (int nx = 0; nx < cgschemas.length; nx++) {
                physical[nx] = cgschemas[nx].getSchema();
            }
            cgDeletedFlags = new boolean[physical.length];

            version = SCHEMA_VERSION;

            // write out the schema
            createSchemaFile(path, conf);
        }

        /** @return the comparator string; never {@code null} after construction. */
        public String getComparator() {
            return comparator;
        }

        /** @return the partition built from the schema and storage hint. */
        public Partition getPartition() {
            return partition;
        }

        /** @return the sorted flag of this table. */
        public boolean isSorted() {
            return sorted;
        }

        /** @return the sort information, possibly {@code null}. */
        public SortInfo getSortInfo() {
            return sortInfo;
        }

        /** @return the logical schema. */
        public Schema getLogical() {
            return logical;
        }

        /** @return the number of physical schemas. */
        public int getNumOfPhysicalSchemas() {
            return physical.length;
        }

        /**
         * @param nx
         *          the column group index.
         * @return the physical schema at that index.
         */
        public Schema getPhysicalSchema(int nx) {
            return physical[nx];
        }

        /**
         * @param nx
         *          the column group index.
         * @return the name of that column group.
         */
        public String getName(int nx) {
            return cgschemas[nx].getName();
        }

        /**
         * @param nx
         *          the column group index.
         * @return the serializer of that column group.
         */
        public String getSerializer(int nx) {
            return cgschemas[nx].getSerializer();
        }

        /**
         * @param nx
         *          the column group index.
         * @return the compressor of that column group.
         */
        public String getCompressor(int nx) {
            return cgschemas[nx].getCompressor();
        }

        /**
         * Returns the index for CG with the given name. -1 indicates that there is
         * no CG with the name.
         */
        int getCGByName(String cgName) {
            for (int i = 0; i < physical.length; i++) {
                if (cgName.equals(getName(i))) {
                    return i;
                }
            }
            return -1;
        }

        /** Returns whether the CG at the given index is flagged as deleted. */
        boolean isCGDeleted(int idx) {
            return cgDeletedFlags[idx];
        }

        /**
         * @param nx
         *          the column group index.
         * @return the owner of that column group.
         */
        public String getOwner(int nx) {
            return cgschemas[nx].getOwner();
        }

        /**
         * @param nx
         *          the column group index.
         * @return the group of that column group.
         */
        public String getGroup(int nx) {
            return cgschemas[nx].getGroup();
        }

        /**
         * @param nx
         *          the column group index.
         * @return the permission value of that column group.
         */
        public short getPerm(int nx) {
            return cgschemas[nx].getPerm();
        }

        /**
         * @return the string representation of the logical schema.
         */
        public String getBTSchemaString() {
            return logical.toString();
        }

        /**
         * @return the string representation of the storage hints
         */
        public String getStorageString() {
            return storage;
        }

        /**
         * Writes this schema to the schema file under {@code path}, overwriting any
         * existing file. The field order written here is the order expected by
         * {@link #readSchemaFile} and {@link #getNumCGs}.
         */
        private void createSchemaFile(Path path, Configuration conf) throws IOException {
            // TODO: overwrite existing schema file, or need a flag?
            FSDataOutputStream outSchema = path.getFileSystem(conf).create(makeSchemaFilePath(path), true);
            try {
                version.write(outSchema);
                WritableUtils.writeString(outSchema, comparator);
                WritableUtils.writeString(outSchema, logical.toString());
                WritableUtils.writeString(outSchema, storage);
                WritableUtils.writeVInt(outSchema, physical.length);
                for (int nx = 0; nx < physical.length; nx++) {
                    WritableUtils.writeString(outSchema, physical[nx].toString());
                }
                WritableUtils.writeVInt(outSchema, sorted ? 1 : 0);
                WritableUtils.writeVInt(outSchema, sortInfo == null ? 0 : sortInfo.size());
                if (sortInfo != null && sortInfo.size() > 0) {
                    String[] sortedCols = sortInfo.getSortColumnNames();
                    for (int i = 0; i < sortInfo.size(); i++) {
                        WritableUtils.writeString(outSchema, sortedCols[i]);
                    }
                }
            } finally {
                // Close even when a write fails so the stream is not leaked.
                outSchema.close();
            }
        }

        /**
         * Populates this object from the schema file under {@code path}.
         *
         * @param deletedCGs
         *          names of column groups to flag as deleted, or {@code null} to
         *          derive the flags from the table directory.
         * @throws IOException
         *           if the file is missing, has an incompatible version, or cannot
         *           be read or parsed.
         */
        private void readSchemaFile(Path path, String[] deletedCGs, Configuration conf) throws IOException {
            FSDataInputStream in = openSchemaFile(path, conf);
            try {
                version = readCompatibleVersion(in);
                comparator = WritableUtils.readString(in);
                String logicalStr = WritableUtils.readString(in);
                try {
                    logical = new Schema(logicalStr);
                } catch (Exception e) {
                    throw new IOException("Schema build failed :" + e.getMessage(), e);
                }
                storage = WritableUtils.readString(in);
                try {
                    partition = new Partition(logicalStr, storage, comparator);
                } catch (Exception e) {
                    throw new IOException("Partition constructor failed :" + e.getMessage(), e);
                }
                cgschemas = partition.getCGSchemas();
                int numCGs = WritableUtils.readVInt(in);
                physical = new Schema[numCGs];
                cgDeletedFlags = new boolean[physical.length];

                try {
                    for (int nx = 0; nx < numCGs; nx++) {
                        String cgschemastr = WritableUtils.readString(in);
                        TableSchemaParser parser = new TableSchemaParser(new StringReader(cgschemastr));
                        physical[nx] = parser.RecordSchema(null);
                    }
                } catch (Exception e) {
                    throw new IOException("parser.RecordSchema failed :" + e.getMessage(), e);
                }

                sorted = WritableUtils.readVInt(in) == 1;
                if (deletedCGs == null) {
                    setCGDeletedFlags(path, conf);
                } else {
                    for (String deletedCG : deletedCGs) {
                        for (int i = 0; i < cgschemas.length; i++) {
                            if (cgschemas[i].getName().equals(deletedCG)) {
                                cgDeletedFlags[i] = true;
                            }
                        }
                    }
                }

                // Sort column names are only read for versions newer than 1.0.
                if (version.compareTo(new Version((short) 1, (short) 0)) > 0) {
                    int numSortColumns = WritableUtils.readVInt(in);
                    if (numSortColumns > 0) {
                        String[] sortColumnStr = new String[numSortColumns];
                        for (int i = 0; i < numSortColumns; i++) {
                            sortColumnStr[i] = WritableUtils.readString(in);
                        }
                        sortInfo = SortInfo.parse(SortInfo.toSortString(sortColumnStr), logical, comparator);
                    }
                }
            } finally {
                // Close even when reading or parsing fails so the stream is not leaked.
                in.close();
            }
        }

        /**
         * Reads only the number of column groups recorded in the schema file under
         * {@code path}, skipping the comparator, logical schema and storage strings
         * that precede it.
         *
         * @throws IOException
         *           if the file is missing, has an incompatible version, or cannot
         *           be read.
         */
        private static int getNumCGs(Path path, Configuration conf) throws IOException {
            FSDataInputStream in = openSchemaFile(path, conf);
            try {
                readCompatibleVersion(in);
                // read comparator
                WritableUtils.readString(in);
                // read logicalStr
                WritableUtils.readString(in);
                // read storage
                WritableUtils.readString(in);
                return WritableUtils.readVInt(in);
            } finally {
                in.close();
            }
        }

        /** Opens the schema file under {@code path}, failing if it does not exist. */
        private static FSDataInputStream openSchemaFile(Path path, Configuration conf) throws IOException {
            Path pathSchema = makeSchemaFilePath(path);
            FileSystem fs = path.getFileSystem(conf);
            if (!fs.exists(pathSchema)) {
                throw new IOException("BT Schema file doesn't exist: " + pathSchema);
            }
            return fs.open(pathSchema);
        }

        /**
         * Reads the version header from {@code in} and rejects it when it is not
         * compatible with {@code SCHEMA_VERSION}.
         */
        private static Version readCompatibleVersion(FSDataInputStream in) throws IOException {
            Version fileVersion = new Version(in);
            // verify compatibility against SCHEMA_VERSION
            if (!fileVersion.compatibleWith(SCHEMA_VERSION)) {
                throw new IOException(
                        "Incompatible versions, expecting: " + SCHEMA_VERSION + "; found in file: " + fileVersion);
            }
            return fileVersion;
        }

        /** Returns the path of the schema file inside the table directory. */
        private static Path makeSchemaFilePath(Path parent) {
            return new Path(parent, BT_SCHEMA_FILE);
        }

        /**
         * Sets the cgDeletedFlags array by checking for the presence of marker
         * entries in the table top level directory whose name is
         * {@code DELETED_CG_PREFIX} followed by the column group name. Only
         * non-directory entries are considered.
         */
        void setCGDeletedFlags(Path path, Configuration conf) throws IOException {

            Set<String> deletedCGs = new HashSet<String>();

            for (FileStatus file : path.getFileSystem(conf).listStatus(path)) {
                if (!file.isDir()) {
                    String fname = file.getPath().getName();
                    if (fname.startsWith(DELETED_CG_PREFIX)) {
                        deletedCGs.add(fname.substring(DELETED_CG_PREFIX.length()));
                    }
                }
            }

            for (int i = 0; i < physical.length; i++) {
                cgDeletedFlags[i] = deletedCGs.contains(getName(i));
            }
        }

        /**
         * @return the names of all column groups flagged as deleted, joined with
         *         {@code DELETED_CG_SEPARATOR_PER_TABLE}; empty when none are flagged.
         */
        String getDeletedCGs() {
            StringBuilder sb = new StringBuilder();
            boolean first = true;
            for (int i = 0; i < physical.length; i++) {
                if (cgDeletedFlags[i]) {
                    if (first) {
                        first = false;
                    } else {
                        sb.append(DELETED_CG_SEPARATOR_PER_TABLE);
                    }
                    sb.append(getName(i));
                }
            }
            return sb.toString();
        }
    }

    /**
     * Dumps information about a Basic Table with no indentation.
     *
     * @param file
     *          the path string of the Basic Table.
     * @param out
     *          the stream the information is printed to.
     * @param conf
     *          the configuration object.
     * @throws IOException
     *           if the four-argument overload fails.
     */
    public static void dumpInfo(String file, PrintStream out, Configuration conf) throws IOException {
        final int noIndent = 0;
        dumpInfo(file, out, conf, noIndent);
    }

    /**
     * Dumps the schema, storage information, sort columns and per-column-group
     * details of a Basic Table to {@code out}.
     *
     * @param file
     *          the path string of the Basic Table.
     * @param out
     *          the stream the information is printed to.
     * @param conf
     *          the configuration object.
     * @param indent
     *          the indentation passed to {@code IOutils.indent} before most lines.
     * @throws IOException
     *           if opening or inspecting the table fails; the original failure is
     *           kept as the cause.
     */
    static public void dumpInfo(String file, PrintStream out, Configuration conf, int indent) throws IOException {
        IOutils.indent(out, indent);
        out.println("Basic Table : " + file);
        Path path = new Path(file);
        BasicTable.Reader reader = null;
        try {
            reader = new BasicTable.Reader(path, conf);
            String schemaStr = reader.getBTSchemaString();
            String storageStr = reader.getStorageString();
            IOutils.indent(out, indent);
            out.printf("Schema : %s\n", schemaStr);
            IOutils.indent(out, indent);
            out.printf("Storage Information : %s\n", storageStr);
            SortInfo sortInfo = reader.getSortInfo();
            if (sortInfo != null && sortInfo.size() > 0) {
                IOutils.indent(out, indent);
                String[] sortedCols = sortInfo.getSortColumnNames();
                out.println("Sorted Columns :");
                for (int nx = 0; nx < sortedCols.length; nx++) {
                    if (nx > 0) {
                        out.printf(" , ");
                    }
                    out.printf("%s", sortedCols[nx]);
                }
                out.printf("\n");
            }
            IOutils.indent(out, indent);
            out.println("Column Groups within the Basic Table :");
            for (int nx = 0; nx < reader.colGroups.length; nx++) {
                IOutils.indent(out, indent);
                out.printf("\nColumn Group [%d] :", nx);
                if (reader.colGroups[nx] != null) {
                    ColumnGroup.dumpInfo(reader.colGroups[nx].path, out, conf, indent);
                } else {
                    // print basic info for deleted column groups.
                    out.printf("\nColum Group : DELETED");
                    out.printf("\nName : %s", reader.schemaFile.getName(nx));
                    out.printf("\nSchema : %s\n", reader.schemaFile.cgschemas[nx].getSchema().toString());
                }
            }
        } catch (Exception e) {
            // Keep the original exception as the cause so its stack trace survives.
            throw new IOException("BasicTable.Reader failed : " + e.getMessage(), e);
        } finally {
            // Release the reader; previously it was never closed.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Best-effort cleanup of a read-only handle: a close failure must
                    // not mask the primary exception or fail a completed dump.
                }
            }
        }
    }

    /**
     * Command-line entry point: dumps information about each Basic Table path
     * given as an argument to standard output. With no arguments it prints a
     * usage line and exits with status 0. A failure on one path prints its stack
     * trace to standard error and does not stop the remaining paths.
     *
     * @param args
     *          one or more Basic Table paths.
     */
    public static void main(String[] args) {
        System.out.printf("BasicTable Dumper\n");
        final boolean noPathsGiven = (args.length == 0);
        if (noPathsGiven) {
            System.out.println("Usage: java ... org.apache.hadoop.zebra.io.BasicTable path [path ...]");
            System.exit(0);
        }
        Configuration conf = new Configuration();
        for (int i = 0; i < args.length; i++) {
            String tablePath = args[i];
            try {
                dumpInfo(tablePath, System.out, conf);
            } catch (IOException ioe) {
                ioe.printStackTrace(System.err);
            }
        }
    }
}