cascading.hcatalog.CascadingHCatUtil.java Source code

Introduction

Here is the source code for cascading.hcatalog.CascadingHCatUtil.java, a utility class for Cascading that reads table and partition metadata from the Hive metastore (HCatalog): it resolves the storage locations behind tables and partitions, updates a table's data location, and builds an HCatSchema from metastore column definitions.

Source

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.hcatalog;

import cascading.cascade.CascadeException;
import com.google.common.base.Preconditions;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import static com.google.common.collect.Lists.newArrayList;

/**
 * Utility methods for working with HCatalog-managed Hive tables from
 * Cascading: resolving the storage locations behind tables and partitions,
 * updating a table's data location, and building an {@link HCatSchema}
 * from metastore column definitions.
 *
 * @author txiao
 */
public class CascadingHCatUtil {
    private static final Logger LOG = LoggerFactory.getLogger(CascadingHCatUtil.class);

    /**
     * Resolves the filesystem locations that back a Hive table. For a
     * partitioned table, the data files of every partition matching the
     * filter are returned; for an unpartitioned table, the table's storage
     * location is returned directly.
     *
     * @param db      database name
     * @param table   table name; must not be null
     * @param filter  partition filter expression such as "ds >= 1234", or
     *                null/blank to select all partitions
     * @param jobConf job configuration used to reach the metastore
     * @return a list of data locations
     */
    public static List<String> getDataStorageLocation(String db, String table, String filter, JobConf jobConf) {
        Preconditions.checkNotNull(table, "Table name must not be null");

        HiveMetaStoreClient client = null;
        List<String> locations = new ArrayList<String>();

        try {
            client = getHiveMetaStoreClient(jobConf);
            Table hiveTable = HCatUtil.getTable(client, db, table);

            if (hiveTable.isPartitioned()) {
                List<Partition> parts = null;
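                // A max_parts argument of -1 asks the metastore for all
                // matching partitions rather than capping the result.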
                if (null != StringUtils.stripToNull(filter)) {
                    parts = client.listPartitionsByFilter(db, table, filter, (short) -1);
                } else {
                    parts = client.listPartitions(db, table, (short) -1);
                }

                if (!parts.isEmpty()) {
                    // More than one partition can match when the filter is a
                    // range expression such as "ds >= 1234".
                    for (Partition part : parts) {
                        locations.addAll(getFilesInHivePartition(part, jobConf));
                    }
                } else {
                    logError(
                            "Table " + hiveTable.getTableName() + " doesn't have the specified partition: " + filter,
                            null);
                }

            } else {
                locations.add(hiveTable.getTTable().getSd().getLocation());
            }
        } catch (IOException e) {
            logError("Error occurred while getting the Hive configuration", e);
        } catch (MetaException e) {
            logError("Error occurred while getting the HiveMetaStoreClient", e);
        } catch (NoSuchObjectException e) {
            logError("Table doesn't exist in HCatalog: " + table, e);
        } catch (TException e) {
            logError("Error occurred while getting the table", e);
        } finally {
            HCatUtil.closeHiveClientQuietly(client);
        }

        return locations;
    }

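    /**
     * Lists the data files inside a single Hive partition, skipping any
     * file whose name matches the configured ignore regex.
     */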
    protected static List<String> getFilesInHivePartition(Partition part, JobConf jobConf) {
        List<String> result = newArrayList();

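        // With the default empty pattern, matches() is true only for an empty
        // file name, so no files are skipped unless the regex is configured.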
        String ignoreFileRegex = jobConf.get(HCatTap.IGNORE_FILE_IN_PARTITION_REGEX, "");
        Pattern ignoreFilePattern = Pattern.compile(ignoreFileRegex);

        try {
            Path partitionDirPath = new Path(part.getSd().getLocation());
            FileStatus[] partitionContent = partitionDirPath.getFileSystem(jobConf).listStatus(partitionDirPath);
            for (FileStatus currStatus : partitionContent) {
                if (!currStatus.isDir()) {
                    if (!ignoreFilePattern.matcher(currStatus.getPath().getName()).matches()) {
                        result.add(currStatus.getPath().toUri().getPath());
                    } else {
                        LOG.debug("Ignoring path {} since matches ignore regex {}",
                                currStatus.getPath().toUri().getPath(), ignoreFileRegex);
                    }
                }
            }

        } catch (IOException e) {
            logError("Unable to read the content of partition '" + part.getSd().getLocation() + "'", e);
        }

        return result;
    }

    /**
     * Points a Hive table at a new storage location by altering its
     * metadata in the metastore.
     *
     * @param db      database name
     * @param table   table name; must not be null
     * @param filter  partition filter; currently unused by this method
     * @param path    the new data location for the table
     * @param jobConf job configuration used to reach the metastore
     * @return true on success; failures are rethrown as {@link CascadeException}
     */
    public static boolean setDataStorageLocation(String db, String table, String filter, String path,
            JobConf jobConf) {
        Preconditions.checkNotNull(table, "Table name must not be null");

        HiveMetaStoreClient client = null;

        try {
            client = getHiveMetaStoreClient(jobConf);

            Table hiveTable = HCatUtil.getTable(client, db, table);
            hiveTable.setDataLocation(new Path(path));

            client.alter_table(db, table, hiveTable.getTTable());
        } catch (IOException e) {
            logError("Error occurred while getting the Hive configuration", e);
        } catch (MetaException e) {
            logError("Error occurred while getting the HiveMetaStoreClient", e);
        } catch (NoSuchObjectException e) {
            logError("Table doesn't exist in HCatalog: " + table, e);
        } catch (TException e) {
            logError("Error occurred while altering the table", e);
        } finally {
            HCatUtil.closeHiveClientQuietly(client);
        }

        return true;
    }

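    /**
     * Fetches the Hive {@link Table} metadata object for the given database
     * and table name.
     */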
    public static Table getHiveTable(String db, String table, JobConf conf) {
        HiveMetaStoreClient client = null;
        Table hiveTable = null;

        try {
            client = getHiveMetaStoreClient(conf);
            hiveTable = HCatUtil.getTable(client, db, table);
        } catch (IOException e) {
            logError("Error occurred while getting the Hive configuration", e);
        } catch (MetaException e) {
            logError("Error occurred while getting the HiveMetaStoreClient", e);
        } catch (NoSuchObjectException e) {
            logError("Table doesn't exist in HCatalog: " + table, e);
        } catch (TException e) {
            logError("Error occurred while getting the table", e);
        } finally {
            HCatUtil.closeHiveClientQuietly(client);
        }

        return hiveTable;
    }

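    /**
     * Builds a {@link HiveConf} from the given JobConf and opens a metastore
     * client; callers are responsible for closing it, e.g. via
     * {@link HCatUtil#closeHiveClientQuietly}.
     */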
    private static HiveMetaStoreClient getHiveMetaStoreClient(JobConf jobConf) throws IOException, MetaException {
        HiveConf hiveConf = HCatUtil.getHiveConf(jobConf);

        return HCatUtil.getHiveClient(hiveConf);
    }

    /**
     * Builds the {@link org.apache.hive.hcatalog.data.schema.HCatSchema} of
     * a table from a list of
     * {@link org.apache.hadoop.hive.metastore.api.FieldSchema} columns.
     *
     * @param columns the table's column definitions from the metastore
     * @return the corresponding HCatSchema
     */
    public static HCatSchema buildHCatSchema(List<FieldSchema> columns) {
        HCatSchema schema = null;

        try {
            schema = new HCatSchema(HCatUtil.getHCatFieldSchemaList(columns));
        } catch (HCatException e) {
            logError("Error occured when building table schema", e);
        }

        return schema;
    }

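    /**
     * Logs the message and rethrows the failure as an unchecked
     * {@link CascadeException}; this method never returns normally.
     */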
    private static void logError(String message, Exception e) {
        LOG.error(message, e);
        throw new CascadeException(message, e);
    }

    /**
     * Assigns the HCatalog default database name to db if db is null.
     *
     * @param db database name, possibly null
     * @return db, or the HCatalog default database name if db was null
     */
    public static String hcatDefaultDBIfNull(String db) {
        return db == null ? MetaStoreUtils.DEFAULT_DATABASE_NAME : db;
    }
}
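
Example

The sketch below shows one way this utility might be invoked; it is not part of the original file. The metastore URI, database, table name, and partition filter are illustrative assumptions, and running it requires a Hadoop/Hive client classpath and a reachable metastore.

import java.util.List;

import org.apache.hadoop.mapred.JobConf;

import cascading.hcatalog.CascadingHCatUtil;

public class CascadingHCatUtilExample {
    public static void main(String[] args) {
        // Point the job configuration at a metastore; this URI is an
        // assumption for illustration.
        JobConf jobConf = new JobConf();
        jobConf.set("hive.metastore.uris", "thrift://localhost:9083");

        // Fall back to HCatalog's default database when none is supplied.
        String db = CascadingHCatUtil.hcatDefaultDBIfNull(null);

        // Resolve the data files behind the partitions that match the
        // filter; the table name and filter are hypothetical.
        List<String> locations = CascadingHCatUtil.getDataStorageLocation(
                db, "page_views", "ds >= '2014-01-01'", jobConf);

        for (String location : locations) {
            System.out.println(location);
        }
    }
}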