org.apache.pig.piggybank.storage.partition.PathPartitioner.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.pig.piggybank.storage.partition.PathPartitioner.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.pig.piggybank.storage.partition;

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 *
 * It is sometimes convenient to partition logs by date or by other values,
 * e.g. country, city etc.<br/>
 * A daydate partitioned hdfs directory might look something like:<br/>
 *
 * <pre>
 * /logs/repo/mylog/
 *                daydate=2010-01-01
 *                 daydate=2010-01-02
 * </pre>
 *
 * This class accepts a path like /logs/repo/mylog and returns the partition
 * keys found in or beneath it, either as an ordered set of keys or as an
 * ordered map of keys to values.
 */
public class PathPartitioner {

    /**
     * Extracts every {@code key=value} pair that appears as a component of the
     * given path string.
     * <p>
     * The location is split on {@code '/'} and each component is inspected on
     * its own; the file system is not consulted. If the same key occurs more
     * than once, the last value wins but the key keeps the position of its
     * first occurrence.
     *
     * @param location
     *            String path, e.g. /logs/repo/mylog/year=2010/month=07
     * @return Map of partition key to partition value. The order is maintained
     *         as per the path, i.e. for /logs/repo/year=2010/month=07 the first
     *         entry is year and the second month. Empty if the path holds no
     *         key=value components.
     * @throws IOException
     *             declared for interface compatibility; this implementation
     *             performs no I/O
     */
    public Map<String, String> getPathPartitionKeyValues(String location) throws IOException {

        // use LinkedHashMap because order is important here.
        Map<String, String> partitionKeys = new LinkedHashMap<String, String>();

        for (String pathComponent : location.split("/")) {
            parseAndPutKeyValue(pathComponent, partitionKeys);
        }

        return partitionKeys;
    }

    /**
     * Searches for the partition keys of the key=value directories found at and
     * below the path pointed to by the location parameter.
     * <p>
     * The last component of the location itself is parsed first. The directory
     * tree is then walked downwards, following at each level only the first
     * sub-directory (in the order returned by the file system) whose name does
     * not start with "_". The walk stops at an empty directory, at a file, or
     * at a level that has no such sub-directory.
     * <p>
     * NOTE(review): following a single sub-directory per level presumably
     * relies on all sibling directories using the same partition key - confirm
     * against the callers.
     *
     * @param location
     *            String root path in hdfs e.g. /user/hive/warehouse or
     *            /logs/repo
     * @param conf
     *            Configuration used to resolve the file system of the location
     * @return Set of String. The order is maintained as per the directory tree.
     *         i.e. if /logs/repo/year=2010/month=2010 exists the first item in
     *         the set will be year and the second month.
     * @throws IOException
     *             if the file system cannot be obtained or listed
     */
    public Set<String> getPartitionKeys(String location, Configuration conf) throws IOException {

        Path path = new Path(location);
        FileSystem fs = path.getFileSystem(conf);

        // use LinkedHashSet because order is important here.
        Set<String> partitionKeys = new LinkedHashSet<String>();

        // the location itself may already end in a key=value directory.
        parseAndPutKeyValue(location, partitionKeys);

        while (path != null) {
            FileStatus[] children = fs.listStatus(path);
            if (children == null || fs.isFile(path) || children.length == 0) {
                break;
            }

            // Only descend into a directory that passed the filter below.
            // Previously the current path was overwritten before filtering,
            // so the walk could continue into an ignored "_" directory.
            Path nextDirectory = null;
            for (FileStatus child : children) {
                Path childPath = child.getPath();

                // ignore hidden directories and plain files
                if (childPath.getName().startsWith("_") || !child.isDir()) {
                    continue;
                }

                parseAndPutKeyValue(childPath.getName(), partitionKeys);
                // at the first directory found stop the for loop after parsing
                // for key value pairs
                nextDirectory = childPath;
                break;
            }

            // null ends the walk: nothing at this level can be followed.
            path = nextDirectory;
        }

        return partitionKeys;
    }

    /**
     * Parses pathName and, if it yields a key=value pair, stores it in the map.
     */
    private void parseAndPutKeyValue(String pathName, Map<String, String> partitionKeys) {
        String[] keyValue = parsePathKeyValue(pathName);
        if (keyValue != null) {
            partitionKeys.put(keyValue[0], keyValue[1]);
        }
    }

    /**
     * Parses pathName and, if it yields a key=value pair, adds the key to the
     * set.
     */
    private void parseAndPutKeyValue(String pathName, Set<String> partitionKeys) {
        String[] keyValue = parsePathKeyValue(pathName);
        if (keyValue != null) {
            partitionKeys.add(keyValue[0]);
        }
    }

    /**
     * Will look for a key=value pair in the last component of the path, for
     * example: /user/hive/warehouse/mylogs/year=2010/month=07 gives
     * [month, 07].
     * <p>
     * Only the text after the last '/' is examined, so the returned key never
     * contains a slash. The component must split on '=' into exactly two
     * parts; a component with no '=', with more than one '=' separated part,
     * or with nothing after the '=' gives null.
     *
     * @param path
     *            a full path or a single path component
     * @return String[] [0]= key [1] = value, or null if the last component is
     *         not a key=value pair
     */
    public String[] parsePathKeyValue(String path) {
        int slashIndex = path.lastIndexOf('/');

        // +1 skips the slash itself; the old substring(slashIndex) kept it, so
        // keys came back as "/year". ">= 0" also covers a slash at index 0.
        String lastComponent = (slashIndex >= 0) ? path.substring(slashIndex + 1) : path;

        if (!lastComponent.contains("=")) {
            return null;
        }

        String[] split = lastComponent.split("=");
        return (split.length == 2) ? split : null;
    }

}