Usage examples for org.apache.hadoop.mapreduce.Job#getNumReduceTasks()
public int getNumReduceTasks()
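Before the project-specific examples, here is a minimal, self-contained sketch of how the call is typically used; the class name, job name, and the 16-task limit are illustrative assumptions, not taken from any of the sources below. It reads the configured reduce count, caps it against an external limit (the same pattern the HBase/Blur examples use with region or shard counts), and checks for a map-only job.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class NumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "num-reduce-tasks-demo");

        // getNumReduceTasks() reflects the configured reduce count
        // (mapreduce.job.reduces; 1 unless set otherwise).
        int configured = job.getNumReduceTasks();

        // Common pattern from the examples below: cap the reduce count
        // against some external limit such as regions, shards, or partitions.
        int externalLimit = 16; // hypothetical limit, for illustration only
        if (configured > externalLimit) {
            job.setNumReduceTasks(externalLimit);
        }

        // A reduce count of 0 marks a map-only job.
        boolean mapOnly = job.getNumReduceTasks() == 0;
        System.out.println("reduces=" + job.getNumReduceTasks() + ", mapOnly=" + mapOnly);
    }
}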
From source file: gr.ntua.h2rdf.inputFormat.TableMapReduceUtil.java
License: Open Source License

/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table The output table.
 * @param reducer The reducer class to use.
 * @param job The current job to adjust.
 * @param partitioner Partitioner to use. Pass <code>null</code> to use
 * default partitioner.
 * @throws IOException When determining the region count fails.
 */
public static void initTableReducerJob(String table, Class<? extends TableReducer> reducer, Job job,
        Class partitioner) throws IOException {
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null)
        job.setReducerClass(reducer);
    job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, table);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
        job.setPartitionerClass(HRegionPartitioner.class);
        HTable outputTable = new HTable(new HBaseConfiguration(job.getConfiguration()), table);
        int regions = outputTable.getRegionsInfo().size();
        if (job.getNumReduceTasks() > regions) {
            job.setNumReduceTasks(outputTable.getRegionsInfo().size());
        }
    } else if (partitioner != null) {
        job.setPartitionerClass(partitioner);
    }
}
From source file: gr.ntua.h2rdf.inputFormat.TableMapReduceUtil.java
License: Open Source License

/**
 * Ensures that the given number of reduce tasks for the given job
 * configuration does not exceed the number of regions for the given table.
 *
 * @param table The table to get the region count for.
 * @param job The current job to adjust.
 * @throws IOException When retrieving the table details fails.
 */
public static void limitNumReduceTasks(String table, Job job) throws IOException {
    HTable outputTable = new HTable(new HBaseConfiguration(job.getConfiguration()), table);
    int regions = outputTable.getRegionsInfo().size();
    if (job.getNumReduceTasks() > regions)
        job.setNumReduceTasks(regions);
}
From source file: gr.ntua.h2rdf.inputFormat2.TableMapReduceUtil.java
License: Open Source License

/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table The output table.
 * @param reducer The reducer class to use.
 * @param job The current job to adjust. Make sure the passed job is
 * carrying all necessary HBase configuration.
 * @param partitioner Partitioner to use. Pass <code>null</code> to use
 * default partitioner.
 * @param quorumAddress Distant cluster to write to; default is null for
 * output to the cluster that is designated in <code>hbase-site.xml</code>.
 * Set this String to the zookeeper ensemble of an alternate remote cluster
 * when you would have the reduce write a cluster that is other than the
 * default; e.g. copying tables between clusters, the source would be
 * designated by <code>hbase-site.xml</code> and this param would have the
 * ensemble address of the remote cluster. The format to pass is particular.
 * Pass <code><hbase.zookeeper.quorum>:<hbase.zookeeper.client.port>:<zookeeper.znode.parent></code>
 * such as <code>server,server2,server3:2181:/hbase</code>.
 * @param serverClass redefined hbase.regionserver.class
 * @param serverImpl redefined hbase.regionserver.impl
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 * job classes via the distributed cache (tmpjars).
 * @throws IOException When determining the region count fails.
 */
public static void initTableReducerJob(String table, Class<? extends TableReducer> reducer, Job job,
        Class partitioner, String quorumAddress, String serverClass, String serverImpl,
        boolean addDependencyJars) throws IOException {
    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null)
        job.setReducerClass(reducer);
    conf.set(TableOutputFormat.OUTPUT_TABLE, table);
    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
    if (quorumAddress != null) {
        // Calling this will validate the format
        ZKUtil.transformClusterKey(quorumAddress);
        conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
    }
    if (serverClass != null && serverImpl != null) {
        conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
        conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
    }
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
        job.setPartitionerClass(HRegionPartitioner.class);
        HTable outputTable = new HTable(conf, table);
        int regions = outputTable.getRegionsInfo().size();
        if (job.getNumReduceTasks() > regions) {
            job.setNumReduceTasks(outputTable.getRegionsInfo().size());
        }
    } else if (partitioner != null) {
        job.setPartitionerClass(partitioner);
    }
    if (addDependencyJars) {
        addDependencyJars(job);
    }
    initCredentials(job);
}
From source file: gr.ntua.h2rdf.loadTriples.TotalOrderPartitioner.java
License: Apache License

/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link org.apache.hadoop.io.BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link org.apache.hadoop.io.RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain {@link Job#getNumReduceTasks()} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void setConf(Configuration conf) {
    try {
        this.conf = conf;
        String parts = getPartitionFile(conf);
        final Path partFile = new Path(parts);
        final FileSystem fs = (DEFAULT_PATH.equals(parts)) ? FileSystem.getLocal(conf) // assume in DistributedCache
                : partFile.getFileSystem(conf);
        Job job = new Job(conf);
        Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
        K[] splitPoints = readPartitions(fs, partFile, keyClass, conf);
        if (splitPoints.length > job.getNumReduceTasks() - 1) {
            System.out.println(job.getNumReduceTasks());
            System.out.println(splitPoints.length);
            System.out.println("Wrong number of partitions in keyset:");
            throw new IOException("Wrong number of partitions in keyset:" + splitPoints.length);
        }
        RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
        for (int i = 0; i < splitPoints.length - 1; ++i) {
            if (comparator.compare(splitPoints[i], splitPoints[i + 1]) >= 0) {
                throw new IOException("Split points are out of order");
            }
        }
        boolean natOrder = conf.getBoolean(NATURAL_ORDER, true);
        if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
            partitions = buildTrie((BinaryComparable[]) splitPoints, 0, splitPoints.length, new byte[0],
                    // Now that blocks of identical splitless trie nodes are
                    // represented reentrantly, and we develop a leaf for any trie
                    // node with only one split point, the only reason for a depth
                    // limit is to refute stack overflow or bloat in the pathological
                    // case where the split points are long and mostly look like bytes
                    // iii...iixii...iii . Therefore, we make the default depth
                    // limit large but not huge.
                    conf.getInt(MAX_TRIE_DEPTH, 200));
        } else {
            partitions = new BinarySearchNode(splitPoints, comparator);
        }
    } catch (IOException e) {
        throw new IllegalArgumentException("Can't read partitions file", e);
    }
}
From source file: hu.sztaki.ilab.bigdata.common.tools.InputSampler.java
License: Apache License

/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = job.getConfiguration();
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks();
    K[] samples = sampler.getSample(inf, job);
    LOG.info("Using " + samples.length + " samples");
    RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
    Arrays.sort(samples, comparator);
    Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
    FileSystem fs = dst.getFileSystem(conf);
    if (fs.exists(dst)) {
        fs.delete(dst, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(),
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    for (int i = 1; i < numPartitions; ++i) {
        int k = Math.round(stepSize * i);
        while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
            ++k;
        }
        writer.append(samples[k], nullValue);
        last = k;
    }
    writer.close();
}
From source file: hu.sztaki.ilab.bigdata.common.tools.InputSampler.java
License: Apache License

/**
 * Driver for InputSampler from the command line.
 * Configures a JobConf instance and calls {@link #writePartitionFile}.
 */
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    ArrayList<String> otherArgs = new ArrayList<String>();
    Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else if ("-inFormat".equals(args[i])) {
                job.setInputFormatClass(Class.forName(args[++i]).asSubclass(InputFormat.class));
            } else if ("-keyClass".equals(args[i])) {
                job.setMapOutputKeyClass(Class.forName(args[++i]).asSubclass(WritableComparable.class));
            } else if ("-splitSample".equals(args[i])) {
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new SplitSampler<K, V>(numSamples, maxSplits);
            } else if ("-splitRandom".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else if ("-splitInterval".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new IntervalSampler<K, V>(pcnt, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    if (job.getNumReduceTasks() <= 1) {
        System.err.println("Sampler requires more than one reducer");
        return printUsage();
    }
    if (otherArgs.size() < 2) {
        System.out.println("ERROR: Wrong number of parameters: ");
        return printUsage();
    }
    if (null == sampler) {
        sampler = new RandomSampler<K, V>(0.1, 10000, 10);
    }
    Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
    TotalOrderPartitioner.setPartitionFile(getConf(), outf);
    for (String s : otherArgs) {
        FileInputFormat.addInputPath(job, new Path(s));
    }
    InputSampler.<K, V>writePartitionFile(job, sampler);
    return 0;
}
From source file: org.apache.avro.mapreduce.AvroMultipleOutputs.java
License: Apache License

private void setSchema(Job job, Schema keySchema, Schema valSchema) {
    boolean isMaponly = job.getNumReduceTasks() == 0;
    if (keySchema != null) {
        if (isMaponly)
            AvroJob.setMapOutputKeySchema(job, keySchema);
        else
            AvroJob.setOutputKeySchema(job, keySchema);
    }
    if (valSchema != null) {
        if (isMaponly)
            AvroJob.setMapOutputValueSchema(job, valSchema);
        else
            AvroJob.setOutputValueSchema(job, valSchema);
    }
}
From source file: org.apache.blur.mapreduce.lib.CsvBlurDriverTest.java
License: Apache License

@Test
public void multiplierParamShouldIncreaseReduceTasks() throws Exception {
    Configuration configurationSetup = new Configuration();
    ControllerPool controllerPool = new CsvBlurDriver.ControllerPool() {
        @Override
        public Iface getClient(String controllerConnectionStr) {
            return getMockIface();
        }
    };
    int multiplierParam = 10;
    AtomicReference<Callable<Void>> ref = new AtomicReference<Callable<Void>>();
    Job job = CsvBlurDriver.setupJob(configurationSetup, controllerPool, ref, "-c", "host:40010", "-d",
            "family1", "col1", "col2", "-d", "family2", "col3", "col4", "-t", "table1", "-i",
            _path1.toString(), "-i", _path2.toString(), "-S", "-C", "1000000", "2000000", "-p", "SNAPPY",
            "-r", Integer.toString(multiplierParam));
    assertNotNull(job);
    assertEquals(multiplierParam * shardCount, job.getNumReduceTasks());
}
From source file: org.apache.crunch.lib.sort.TotalOrderPartitioner.java
License: Apache License

@Override
public void setConf(Configuration conf) {
    try {
        this.conf = conf;
        String parts = getPartitionFile(conf);
        final Path partFile = new Path(parts);
        final FileSystem fs = (DEFAULT_PATH.equals(parts)) ? FileSystem.getLocal(conf) // assume in DistributedCache
                : partFile.getFileSystem(conf);
        Job job = new Job(conf);
        Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
        RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
        K[] splitPoints = readPartitions(fs, partFile, keyClass, conf, comparator);
        int numReduceTasks = job.getNumReduceTasks();
        if (splitPoints.length != numReduceTasks - 1) {
            throw new IOException("Wrong number of partitions in keyset");
        }
        partitions = new BinarySearchNode(splitPoints, comparator);
    } catch (IOException e) {
        throw new IllegalArgumentException("Can't read partitions file", e);
    }
}
From source file: org.apache.jena.tdbloader4.partitioners.InputSampler.java
License: Apache License

/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
    log.debug("writePartitionFile({},{})", job, sampler);
    Configuration conf = job.getConfiguration();
    @SuppressWarnings("rawtypes")
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks() / 9;
    log.debug("Number of partitions is {} for each index", numPartitions);
    K[] samples = sampler.getSample(inf, job);
    log.info("Using " + samples.length + " samples");
    writePartitionFile(samples, "GSPO", job, conf, numPartitions);
    writePartitionFile(samples, "GPOS", job, conf, numPartitions);
    writePartitionFile(samples, "GOSP", job, conf, numPartitions);
    writePartitionFile(samples, "SPOG", job, conf, numPartitions);
    writePartitionFile(samples, "POSG", job, conf, numPartitions);
    writePartitionFile(samples, "OSPG", job, conf, numPartitions);
    writePartitionFile(samples, "SPO", job, conf, numPartitions);
    writePartitionFile(samples, "POS", job, conf, numPartitions);
    writePartitionFile(samples, "OSP", job, conf, numPartitions);
}