Java tutorial
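This tutorial walks through AccumuloMrGeoRangePartitioner, MrGeo's adaptation of Accumulo's RangePartitioner. It assigns each TileIdWritable key to a partition by binary searching a sorted list of split points that is shipped to every task through the distributed cache, optionally spreading each range over several random sub-bins.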
/*
 * Copyright 2009-2014 DigitalGlobe, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.mrgeo.data.accumulo.partitioners;

import org.apache.accumulo.core.client.mapreduce.lib.partition.RangePartitioner;
import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Partitioner;
import org.mrgeo.data.tile.TileIdWritable;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Scanner;
import java.util.TreeSet;

/**
 * This class is an adaptation of the Accumulo RangePartitioner. The change
 * here is associated with the keys and values that are used for determining
 * the correct partition.
 */
public class AccumuloMrGeoRangePartitioner extends Partitioner<TileIdWritable, Writable> implements Configurable
{
  private static final String PREFIX = RangePartitioner.class.getName();
  private static final String CUTFILE_KEY = PREFIX + ".cutFile";
  private static final String NUM_SUBBINS = PREFIX + ".subBins";

  private Configuration conf;

  private int _numSubBins = 0;

  private Text[] cutPointArray = null;
  private TileIdWritable[] tileIdPointArray = null;

  @Override
  public int getPartition(TileIdWritable key, Writable value, int numPartitions)
  {
    try
    {
      return findPartition(key, getCutPoints(), getNumSubBins());
    }
    catch (IOException e)
    {
      throw new RuntimeException(e);
    }
  } // end getPartition

  /**
   * Finds the partition for a key by binary searching the sorted array of
   * cut points, optionally spreading each range across several sub-bins.
   */
  int findPartition(TileIdWritable key, TileIdWritable[] array, int numSubBins)
  {
    // find the bin for the range, and guarantee it is positive
    int index = Arrays.binarySearch(array, key);
    index = index < 0 ? (index + 1) * -1 : index;

    // both conditions work with numSubBins == 1, but this check is to avoid
    // hashing, when we don't need to, for speed
    if (numSubBins < 2)
    {
      return index;
    }
    return (key.toString().hashCode() & Integer.MAX_VALUE) % numSubBins + index * numSubBins;
  }

  private synchronized int getNumSubBins()
  {
    if (_numSubBins < 1)
    {
      // get number of sub-bins and guarantee it is positive
      _numSubBins = Math.max(1, getConf().getInt(NUM_SUBBINS, 1));
    }
    return _numSubBins;
  }

  private synchronized TileIdWritable[] getCutPoints() throws IOException
  {
    if (cutPointArray == null)
    {
      // locate the cut file in the local copies of the distributed cache and
      // read one Base64-encoded split point per line
      String cutFileName = conf.get(CUTFILE_KEY);
      Path[] cf = DistributedCache.getLocalCacheFiles(conf);
      if (cf != null)
      {
        for (Path path : cf)
        {
          if (path.toUri().getPath().endsWith(cutFileName.substring(cutFileName.lastIndexOf('/'))))
          {
            TreeSet<Text> cutPoints = new TreeSet<Text>();
            Scanner in = new Scanner(new BufferedReader(new FileReader(path.toString())));
            try
            {
              while (in.hasNextLine())
              {
                cutPoints.add(new Text(Base64.decodeBase64(in.nextLine().getBytes())));
              }
            }
            finally
            {
              in.close();
            }
            cutPointArray = cutPoints.toArray(new Text[cutPoints.size()]);
            break;
          }
        }
      }
      if (cutPointArray == null)
      {
        throw new FileNotFoundException(cutFileName + " not found in distributed cache");
      }
    }

    // convert each decoded cut point (the big-endian bytes of a long tile id)
    // into a TileIdWritable split point
    tileIdPointArray = new TileIdWritable[cutPointArray.length];
    for (int x = 0; x < cutPointArray.length; x++)
    {
      byte[] b = cutPointArray[x].getBytes();
      ByteBuffer buffer = ByteBuffer.wrap(b);
      long k = buffer.getLong();
      tileIdPointArray[x] = new TileIdWritable(k);
    }
    return tileIdPointArray;
  } // end getCutPoints

  @Override
  public Configuration getConf()
  {
    return conf;
  }

  @Override
  public void setConf(Configuration conf)
  {
    this.conf = conf;
  }

  /**
   * Sets the hdfs file name to use, containing a newline separated list of
   * Base64 encoded split points that represent ranges for partitioning
   */
  public static void setSplitFile(JobContext job, String file)
  {
    URI uri = new Path(file).toUri();
    DistributedCache.addCacheFile(uri, job.getConfiguration());
    job.getConfiguration().set(CUTFILE_KEY, uri.getPath());
  }

  /**
   * Sets the number of random sub-bins per range
   */
  public static void setNumSubBins(JobContext job, int num)
  {
    job.getConfiguration().setInt(NUM_SUBBINS, num);
  }
} // end AccumuloMrGeoRangePartitioner
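The split file itself is not produced by this class. Based on how getCutPoints() reads it (one Base64-encoded line per split point, decoded into eight bytes that ByteBuffer.getLong() interprets as a big-endian long), a writer would look roughly like the following. This is a minimal sketch, not part of MrGeo; SplitFileWriter, writeSplits, and its parameters are hypothetical names.

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;
import java.io.PrintStream;
import java.nio.ByteBuffer;

public class SplitFileWriter // hypothetical helper, not part of MrGeo
{
  // Writes one Base64-encoded, big-endian long tile id per line, matching
  // the format getCutPoints() expects to decode.
  public static void writeSplits(Configuration conf, Path splitFile, long[] tileIds) throws IOException
  {
    FileSystem fs = splitFile.getFileSystem(conf);
    PrintStream out = new PrintStream(fs.create(splitFile));
    try
    {
      for (long id : tileIds)
      {
        byte[] raw = ByteBuffer.allocate(8).putLong(id).array();
        out.println(new String(Base64.encodeBase64(raw)));
      }
    }
    finally
    {
      out.close();
    }
  }
}

The lines do not need to be pre-sorted, since getCutPoints() collects them into a TreeSet before converting them to an array.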
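Wiring the partitioner into a job uses the two static setters defined above. The sketch below assumes an existing Job and an already-written split file; PartitionerSetup and its parameters are hypothetical names. The reducer count follows from findPartition(): with numSplits cut points, binary search can return indexes 0 through numSplits, so the job needs (numSplits + 1) * numSubBins reduce tasks to cover every bin.

import org.apache.hadoop.mapreduce.Job;
import org.mrgeo.data.accumulo.partitioners.AccumuloMrGeoRangePartitioner;

public class PartitionerSetup // hypothetical helper, not part of MrGeo
{
  public static void configure(Job job, String splitFile, int numSplits, int numSubBins)
  {
    // point the partitioner at the split file (setSplitFile also registers
    // it in the distributed cache) and set the sub-bin count
    AccumuloMrGeoRangePartitioner.setSplitFile(job, splitFile);
    AccumuloMrGeoRangePartitioner.setNumSubBins(job, numSubBins);
    job.setPartitionerClass(AccumuloMrGeoRangePartitioner.class);

    // findPartition() can return indexes up to (numSplits + 1) * numSubBins - 1
    job.setNumReduceTasks((numSplits + 1) * numSubBins);
  }
}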