com.cloudera.spark.bulkload.TotalOrderPartitioner.java Source code

Introduction

Here is the source code for com.cloudera.spark.bulkload.TotalOrderPartitioner.java
Source

  /**
   * Licensed to the Apache Software Foundation (ASF) under one
   * or more contributor license agreements.  See the NOTICE file
   * distributed with this work for additional information
   * regarding copyright ownership.  The ASF licenses this file
   * to you under the Apache License, Version 2.0 (the
   * "License"); you may not use this file except in compliance
   * with the License.  You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */

  package com.cloudera.spark.bulkload;

  import java.io.IOException;
  import java.lang.reflect.Array;
  import java.util.ArrayList;
  import java.util.Arrays;

  import org.apache.commons.logging.Log;
  import org.apache.commons.logging.LogFactory;
  import org.apache.hadoop.classification.InterfaceAudience;
  import org.apache.hadoop.classification.InterfaceStability;
  import org.apache.hadoop.conf.Configurable;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.io.BinaryComparable;
  import org.apache.hadoop.io.IOUtils;
  import org.apache.hadoop.io.NullWritable;
  import org.apache.hadoop.io.SequenceFile;
  import org.apache.hadoop.io.RawComparator;
  import org.apache.hadoop.io.WritableComparable;
  import org.apache.hadoop.mapreduce.Job;
  import org.apache.hadoop.mapreduce.Partitioner;
  import org.apache.hadoop.util.ReflectionUtils;
  import org.apache.spark.sql.SQLContext;

  /**
   * Partitioner effecting a total order by reading split points from
   * an externally generated source.
   */
  @InterfaceAudience.Public
  @InterfaceStability.Stable
  public class TotalOrderPartitioner<K extends WritableComparable<?>, V> extends Partitioner<K, V>
          implements Configurable {

      private Node partitions;
      public static final String DEFAULT_PATH = "_partition.lst";
      public static final String PARTITIONER_PATH = "mapreduce.totalorderpartitioner.path";
      public static final String MAX_TRIE_DEPTH = "mapreduce.totalorderpartitioner.trie.maxdepth";
      public static final String NATURAL_ORDER = "mapreduce.totalorderpartitioner.naturalorder";
      Configuration conf;
      private static final Log LOG = LogFactory.getLog(TotalOrderPartitioner.class);

      public TotalOrderPartitioner() {
      }

      /**
       * Read in the partition file and build indexing data structures.
       * If the keytype is {@link BinaryComparable} and
       * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
       * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
       * will be built. Otherwise, keys will be located using a binary search of
       * the partition keyset using the {@link RawComparator}
       * defined for this job. The input file must be sorted with the same
       * comparator and contain {@link Job#getNumReduceTasks()} - 1 keys.
       */
      @SuppressWarnings("unchecked") // keytype from conf not static
      public void setConf(Configuration conf) {
          try {
              this.conf = conf;
              String parts = getPartitionFile(conf);
              final Path partFile = new Path(parts);
              final FileSystem fs = (DEFAULT_PATH.equals(parts)) ? FileSystem.getLocal(conf) // assume in DistributedCache
                      : partFile.getFileSystem(conf);

              Job job = new Job(conf);
              Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
              K[] splitPoints = readPartitions(fs, partFile, keyClass, conf);
              if (splitPoints.length != job.getNumReduceTasks() - 1) {
                  throw new IOException("Wrong number of partitions in keyset");
              }
              RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
              for (int i = 0; i < splitPoints.length - 1; ++i) {
                  if (comparator.compare(splitPoints[i], splitPoints[i + 1]) >= 0) {
                      throw new IOException("Split points are out of order");
                  }
              }
              boolean natOrder = conf.getBoolean(NATURAL_ORDER, true);
              if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
                  partitions = buildTrie((BinaryComparable[]) splitPoints, 0, splitPoints.length, new byte[0],
                          // Now that blocks of identical splitless trie nodes are 
                          // represented reentrantly, and we develop a leaf for any trie
                          // node with only one split point, the only reason for a depth
                          // limit is to refute stack overflow or bloat in the pathological
                          // case where the split points are long and mostly look like bytes 
                          // iii...iixii...iii   .  Therefore, we make the default depth
                          // limit large but not huge.
                          conf.getInt(MAX_TRIE_DEPTH, 200));
              } else {
                  partitions = new BinarySearchNode(splitPoints, comparator);
              }
          } catch (IOException e) {
              throw new IllegalArgumentException("Can't read partitions file", e);
          }
      }

      public Configuration getConf() {
          return conf;
      }

      // by construction, we know if our keytype
      @SuppressWarnings("unchecked") // is memcmp-able and uses the trie
      public int getPartition(K key, V value, int numPartitions) {
          return partitions.findPartition(key);
      }

      /**
       * Set the path to the SequenceFile storing the sorted partition keyset.
       * It must be the case that for <tt>R</tt> reduces, there are <tt>R-1</tt>
       * keys in the SequenceFile.
       */
      public static void setPartitionFile(Configuration conf, Path p) {
          conf.set(PARTITIONER_PATH, p.toString());
      }

      /**
       * Get the path to the SequenceFile storing the sorted partition keyset.
       * @see #setPartitionFile(Configuration, Path)
       */
      public static String getPartitionFile(Configuration conf) {
          return conf.get(PARTITIONER_PATH, DEFAULT_PATH);
      }

      /**
       * Interface to the partitioner to locate a key in the partition keyset.
       */
      interface Node<T> {
          /**
           * Locate partition in keyset K, st [Ki..Ki+1) defines a partition,
           * with implicit K0 = -inf, Kn = +inf, and |K| = #partitions - 1.
           */
          int findPartition(T key);
      }

      /**
       * Base class for trie nodes. If the keytype is memcomp-able, this builds
       * tries of the first <tt>total.order.partitioner.max.trie.depth</tt>
       * bytes.
       */
      static abstract class TrieNode implements Node<BinaryComparable> {
          private final int level;

          TrieNode(int level) {
              this.level = level;
          }

          int getLevel() {
              return level;
          }
      }

      /**
       * For types that are not {@link BinaryComparable} or
       * where disabled by <tt>total.order.partitioner.natural.order</tt>,
       * search the partition keyset with a binary search.
       */
      class BinarySearchNode implements Node<K> {
          private final K[] splitPoints;
          private final RawComparator<K> comparator;

          BinarySearchNode(K[] splitPoints, RawComparator<K> comparator) {
              this.splitPoints = splitPoints;
              this.comparator = comparator;
          }

          public int findPartition(K key) {
              final int pos = Arrays.binarySearch(splitPoints, key, comparator) + 1;
              return (pos < 0) ? -pos : pos;
          }
      }

      /**
       * An inner trie node that contains 256 children based on the next
       * character.
       */
      class InnerTrieNode extends TrieNode {
          private TrieNode[] child = new TrieNode[256];

          InnerTrieNode(int level) {
              super(level);
          }

          public int findPartition(BinaryComparable key) {
              int level = getLevel();
              if (key.getLength() <= level) {
                  return child[0].findPartition(key);
              }
              return child[0xFF & key.getBytes()[level]].findPartition(key);
          }
      }

      /**
       * @param level        the tree depth at this node
       * @param splitPoints  the full split point vector, which holds
       *                     the split point or points this leaf node
       *                     should contain
       * @param lower        first INcluded element of splitPoints
       * @param upper        first EXcluded element of splitPoints
       * @return  a leaf node.  They come in three kinds: no split points 
       *          [and the findParttion returns a canned index], one split
       *          point [and we compare with a single comparand], or more
       *          than one [and we do a binary search].  The last case is
       *          rare.
       */
      private TrieNode LeafTrieNodeFactory(int level, BinaryComparable[] splitPoints, int lower, int upper) {
          switch (upper - lower) {
          case 0:
              return new UnsplitTrieNode(level, lower);

          case 1:
              return new SinglySplitTrieNode(level, splitPoints, lower);

          default:
              return new LeafTrieNode(level, splitPoints, lower, upper);
          }
      }

      /**
       * A leaf trie node that scans for the key between lower..upper.
       * 
       * We don't generate many of these now, since we usually continue trie-ing 
       * when more than one split point remains at this level. and we make different
       * objects for nodes with 0 or 1 split point.
       */
      private class LeafTrieNode extends TrieNode {
          final int lower;
          final int upper;
          final BinaryComparable[] splitPoints;

          LeafTrieNode(int level, BinaryComparable[] splitPoints, int lower, int upper) {
              super(level);
              this.lower = lower;
              this.upper = upper;
              this.splitPoints = splitPoints;
          }

          public int findPartition(BinaryComparable key) {
              final int pos = Arrays.binarySearch(splitPoints, lower, upper, key) + 1;
              return (pos < 0) ? -pos : pos;
          }
      }

      private class UnsplitTrieNode extends TrieNode {
          final int result;

          UnsplitTrieNode(int level, int value) {
              super(level);
              this.result = value;
          }

          public int findPartition(BinaryComparable key) {
              return result;
          }
      }

      private class SinglySplitTrieNode extends TrieNode {
          final int lower;
          final BinaryComparable mySplitPoint;

          SinglySplitTrieNode(int level, BinaryComparable[] splitPoints, int lower) {
              super(level);
              this.lower = lower;
              this.mySplitPoint = splitPoints[lower];
          }

          public int findPartition(BinaryComparable key) {
              return lower + (key.compareTo(mySplitPoint) < 0 ? 0 : 1);
          }
      }

/**
 * Read the cut points from the given IFile.
 * @param fs The file system
 * @param p The path to read
 * @param keyClass The map output key class
 * @param job The job config
 * @throws IOException
 */
                               // matching key types enforced by passing in
@SuppressWarnings("unchecked") // map output key class
private K[] readPartitions(FileSystem fs, Path p, Class<K> keyClass,
    Configuration conf) throws IOException {
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, conf);
  ArrayList<K> parts = new ArrayList<K>();
  K key = ReflectionUtils.newInstance(keyClass, conf);
  NullWritable value = NullWritable.get();
  try {
    while (reader.next(key, value)) {
      parts.add(key);
      key = ReflectionUtils.newInstance(keyClass, conf);
    }
    reader.close();
    reader = null;
  } finally {
    IOUtils.cleanup(LOG, reader);
  }
  SQLContext
  return parts.toArray((K[])Array.newInstance(keyClass, parts.size()));
}

      /**
       * 
       * This object contains a TrieNodeRef if there is such a thing that
       * can be repeated.  Two adjacent trie node slots that contain no 
       * split points can be filled with the same trie node, even if they
       * are not on the same level.  See buildTreeRec, below.
       *
       */
      private class CarriedTrieNodeRef {
          TrieNode content;

          CarriedTrieNodeRef() {
              content = null;
          }
      }

      /**
       * Given a sorted set of cut points, build a trie that will find the correct
       * partition quickly.
       * @param splits the list of cut points
       * @param lower the lower bound of partitions 0..numPartitions-1
       * @param upper the upper bound of partitions 0..numPartitions-1
       * @param prefix the prefix that we have already checked against
       * @param maxDepth the maximum depth we will build a trie for
       * @return the trie node that will divide the splits correctly
       */
      private TrieNode buildTrie(BinaryComparable[] splits, int lower, int upper, byte[] prefix, int maxDepth) {
          return buildTrieRec(splits, lower, upper, prefix, maxDepth, new CarriedTrieNodeRef());
      }

      /**
       * This is the core of buildTrie.  The interface, and stub, above, just adds
       * an empty CarriedTrieNodeRef.  
       * 
       * We build trie nodes in depth first order, which is also in key space
       * order.  Every leaf node is referenced as a slot in a parent internal
       * node.  If two adjacent slots [in the DFO] hold leaf nodes that have
       * no split point, then they are not separated by a split point either, 
       * because there's no place in key space for that split point to exist.
       * 
       * When that happens, the leaf nodes would be semantically identical, and
       * we reuse the object.  A single CarriedTrieNodeRef "ref" lives for the 
       * duration of the tree-walk.  ref carries a potentially reusable, unsplit
       * leaf node for such reuse until a leaf node with a split arises, which 
       * breaks the chain until we need to make a new unsplit leaf node.
       * 
       * Note that this use of CarriedTrieNodeRef means that for internal nodes, 
       * for internal nodes if this code is modified in any way we still need 
       * to make or fill in the subnodes in key space order.
       */
      private TrieNode buildTrieRec(BinaryComparable[] splits, int lower, int upper, byte[] prefix, int maxDepth,
              CarriedTrieNodeRef ref) {
          final int depth = prefix.length;
          // We generate leaves for a single split point as well as for 
          // no split points.
          if (depth >= maxDepth || lower >= upper - 1) {
              // If we have two consecutive requests for an unsplit trie node, we
              // can deliver the same one the second time.
              if (lower == upper && ref.content != null) {
                  return ref.content;
              }
              TrieNode result = LeafTrieNodeFactory(depth, splits, lower, upper);
              ref.content = lower == upper ? result : null;
              return result;
          }
          InnerTrieNode result = new InnerTrieNode(depth);
          byte[] trial = Arrays.copyOf(prefix, prefix.length + 1);
          // append an extra byte on to the prefix
          int currentBound = lower;
          for (int ch = 0; ch < 0xFF; ++ch) {
              trial[depth] = (byte) (ch + 1);
              lower = currentBound;
              while (currentBound < upper) {
                  if (splits[currentBound].compareTo(trial, 0, trial.length) >= 0) {
                      break;
                  }
                  currentBound += 1;
              }
              trial[depth] = (byte) ch;
              result.child[0xFF & ch] = buildTrieRec(splits, lower, currentBound, trial, maxDepth, ref);
          }
          // pick up the rest
          trial[depth] = (byte) 0xFF;
          result.child[0xFF] = buildTrieRec(splits, lower, currentBound, trial, maxDepth, ref);

          return result;
      }
  }