org.apache.accumulo.hadoop.mapreduce.partition.RangePartitioner.java Source code

Introduction

Here is the source code for org.apache.accumulo.hadoop.mapreduce.partition.RangePartitioner.java, a Hadoop Partitioner from Apache Accumulo that assigns each map output key to a reducer based on a sorted list of range cut points.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.hadoop.mapreduce.partition;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.Arrays;
import java.util.Base64;
import java.util.Scanner;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;

/**
 * Hadoop partitioner that uses ranges, and optionally sub-bins based on hashing. Keys are binned
 * by binary search against a sorted array of cut points loaded from the distributed cache; with
 * sub-binning enabled, a key's hash picks one of several partitions within its range.
 *
 * @since 2.0.0
 */
public class RangePartitioner extends Partitioner<Text, Writable> implements Configurable {
    private static final String PREFIX = RangePartitioner.class.getName();
    private static final String CUTFILE_KEY = PREFIX + ".cutFile";
    private static final String NUM_SUBBINS = PREFIX + ".subBins";

    private Configuration conf;

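    /**
     * Maps a key to its range bin via binary search over the cut points and, when sub-binning is
     * enabled, to a hash-selected sub-bin within that range.
     */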
    @Override
    public int getPartition(Text key, Writable value, int numPartitions) {
        try {
            return findPartition(key, getCutPoints(), getNumSubBins());
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    int findPartition(Text key, Text[] array, int numSubBins) {
        // find the bin for the range, and guarantee it is positive; for a key that is not an
        // exact cut point, binarySearch returns -(insertionPoint) - 1, so (index + 1) * -1
        // recovers the insertion point
        int index = Arrays.binarySearch(array, key);
        index = index < 0 ? (index + 1) * -1 : index;

        // both branches work when numSubBins == 1, but this check skips the hashing
        // when it is not needed, for speed
        if (numSubBins < 2)
            return index;
        return (key.toString().hashCode() & Integer.MAX_VALUE) % numSubBins + index * numSubBins;
    }

    private int _numSubBins = 0;

    private synchronized int getNumSubBins() {
        if (_numSubBins < 1) {
            // get number of sub-bins and guarantee it is positive
            _numSubBins = Math.max(1, getConf().getInt(NUM_SUBBINS, 1));
        }
        return _numSubBins;
    }

    private Text[] cutPointArray = null;

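    /**
     * Lazily loads and caches the cut points: finds the split file among the local distributed
     * cache files, reads one Base64-encoded split point per line, and returns them in sorted
     * order. Throws FileNotFoundException if the file is not present in the cache.
     */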
    @SuppressFBWarnings(value = "PATH_TRAVERSAL_IN", justification = "path provided by distributed cache framework, not user input")
    private synchronized Text[] getCutPoints() throws IOException {
        if (cutPointArray == null) {
            String cutFileName = conf.get(CUTFILE_KEY);
            Path[] cf = Job.getInstance().getLocalCacheFiles();

            if (cf != null) {
                for (Path path : cf) {
                    if (path.toUri().getPath().endsWith(cutFileName.substring(cutFileName.lastIndexOf('/')))) {
                        TreeSet<Text> cutPoints = new TreeSet<>();
                        try (Scanner in = new Scanner(new BufferedReader(
                                new InputStreamReader(new FileInputStream(path.toString()), UTF_8)))) {
                            while (in.hasNextLine())
                                cutPoints.add(new Text(Base64.getDecoder().decode(in.nextLine())));
                        }
                        cutPointArray = cutPoints.toArray(new Text[cutPoints.size()]);
                        break;
                    }
                }
            }
            if (cutPointArray == null)
                throw new FileNotFoundException(cutFileName + " not found in distributed cache");
        }
        return cutPointArray;
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Sets the HDFS file name to use, containing a newline-separated list of Base64-encoded split
     * points that represent ranges for partitioning
     */
    public static void setSplitFile(Job job, String file) {
        URI uri = new Path(file).toUri();
        job.addCacheFile(uri);
        job.getConfiguration().set(CUTFILE_KEY, uri.getPath());
    }
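
    /*
     * Illustrative sketch (not part of the original class): producing a split file in the format
     * setSplitFile expects -- one Base64-encoded split point per line. The path and split values
     * are hypothetical; assumes java.io.PrintWriter plus the Arrays and Base64 imports above.
     *
     *   try (PrintWriter out = new PrintWriter("/tmp/splits.txt", "UTF-8")) {
     *       for (Text split : new Text[] {new Text("m"), new Text("t")}) {
     *           byte[] bytes = Arrays.copyOf(split.getBytes(), split.getLength());
     *           out.println(Base64.getEncoder().encodeToString(bytes));
     *       }
     *   }
     */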

    /**
     * Sets the number of hash-based sub-bins per range
     */
    public static void setNumSubBins(Job job, int num) {
        job.getConfiguration().setInt(NUM_SUBBINS, num);
    }
}
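
Usage

A minimal driver sketch showing how this partitioner is wired into a MapReduce job. The split file path ("/tmp/splits.txt"), the sub-bin count of 4, and the assumed number of split points (10) are hypothetical; the reducer count must cover every bin the partitioner can return, which is (number of split points + 1) * sub-bins.

import org.apache.accumulo.hadoop.mapreduce.partition.RangePartitioner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class RangePartitionerDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "range-partitioned job");
        job.setJarByClass(RangePartitionerDriver.class);

        // The partitioner's key type is Text, so the map output key must be Text.
        // Mapper, reducer, and input/output format setup are omitted from this sketch.
        job.setMapOutputKeyClass(Text.class);
        job.setPartitionerClass(RangePartitioner.class);

        // HDFS file of newline-separated, Base64-encoded split points; setSplitFile
        // also registers it with the distributed cache. The path is hypothetical.
        RangePartitioner.setSplitFile(job, "/tmp/splits.txt");

        // Optionally spread each range over 4 hashed sub-bins.
        RangePartitioner.setNumSubBins(job, 4);

        // One reducer per bin: (number of split points + 1) * sub-bins.
        // Assumes the split file holds 10 split points.
        job.setNumReduceTasks((10 + 1) * 4);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}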