Usage examples for org.apache.hadoop.mapreduce.Job#getNumReduceTasks()
public int getNumReduceTasks()
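Before the project-specific examples, here is a minimal, self-contained sketch of how the call is typically used; the class name, job name, and the 16-task limit are illustrative assumptions, not taken from any of the sources below. It reads the configured reduce count, caps it against an external limit (the same pattern the HBase/Blur examples use with region or shard counts), and checks for a map-only job.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class NumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "num-reduce-tasks-demo");

        // getNumReduceTasks() reflects the configured reduce count
        // (mapreduce.job.reduces; 1 unless set otherwise).
        int configured = job.getNumReduceTasks();

        // Common pattern from the examples below: cap the reduce count
        // against some external limit such as regions, shards, or partitions.
        int externalLimit = 16; // hypothetical limit, for illustration only
        if (configured > externalLimit) {
            job.setNumReduceTasks(externalLimit);
        }

        // A reduce count of 0 marks a map-only job.
        boolean mapOnly = job.getNumReduceTasks() == 0;
        System.out.println("reduces=" + job.getNumReduceTasks() + ", mapOnly=" + mapOnly);
    }
}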
From source file: gr.ntua.h2rdf.inputFormat.TableMapReduceUtil.java
License: Open Source License

/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table The output table.
 * @param reducer The reducer class to use.
 * @param job The current job to adjust.
 * @param partitioner Partitioner to use. Pass <code>null</code> to use
 * default partitioner.
 * @throws IOException When determining the region count fails.
 */
public static void initTableReducerJob(String table, Class<? extends TableReducer> reducer, Job job,
        Class partitioner) throws IOException {
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null)
        job.setReducerClass(reducer);
    job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, table);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
        job.setPartitionerClass(HRegionPartitioner.class);
        HTable outputTable = new HTable(new HBaseConfiguration(job.getConfiguration()), table);
        int regions = outputTable.getRegionsInfo().size();
        if (job.getNumReduceTasks() > regions) {
            job.setNumReduceTasks(outputTable.getRegionsInfo().size());
        }
    } else if (partitioner != null) {
        job.setPartitionerClass(partitioner);
    }
}
From source file: gr.ntua.h2rdf.inputFormat.TableMapReduceUtil.java
License: Open Source License

/**
 * Ensures that the given number of reduce tasks for the given job
 * configuration does not exceed the number of regions for the given table.
 *
 * @param table The table to get the region count for.
 * @param job The current job to adjust.
 * @throws IOException When retrieving the table details fails.
 */
public static void limitNumReduceTasks(String table, Job job) throws IOException {
    HTable outputTable = new HTable(new HBaseConfiguration(job.getConfiguration()), table);
    int regions = outputTable.getRegionsInfo().size();
    if (job.getNumReduceTasks() > regions)
        job.setNumReduceTasks(regions);
}
From source file: gr.ntua.h2rdf.inputFormat2.TableMapReduceUtil.java
License: Open Source License

/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table The output table.
 * @param reducer The reducer class to use.
 * @param job The current job to adjust. Make sure the passed job is
 * carrying all necessary HBase configuration.
 * @param partitioner Partitioner to use. Pass <code>null</code> to use
 * default partitioner.
 * @param quorumAddress Distant cluster to write to; default is null for
 * output to the cluster that is designated in <code>hbase-site.xml</code>.
 * Set this String to the zookeeper ensemble of an alternate remote cluster
 * when you would have the reduce write a cluster that is other than the
 * default; e.g. copying tables between clusters, the source would be
 * designated by <code>hbase-site.xml</code> and this param would have the
 * ensemble address of the remote cluster. The format to pass is particular.
 * Pass <code><hbase.zookeeper.quorum>:<hbase.zookeeper.client.port>:<zookeeper.znode.parent></code>
 * such as <code>server,server2,server3:2181:/hbase</code>.
 * @param serverClass redefined hbase.regionserver.class
 * @param serverImpl redefined hbase.regionserver.impl
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 * job classes via the distributed cache (tmpjars).
 * @throws IOException When determining the region count fails.
 */
public static void initTableReducerJob(String table, Class<? extends TableReducer> reducer, Job job,
        Class partitioner, String quorumAddress, String serverClass, String serverImpl,
        boolean addDependencyJars) throws IOException {
    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null)
        job.setReducerClass(reducer);
    conf.set(TableOutputFormat.OUTPUT_TABLE, table);
    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
    if (quorumAddress != null) {
        // Calling this will validate the format
        ZKUtil.transformClusterKey(quorumAddress);
        conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
    }
    if (serverClass != null && serverImpl != null) {
        conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
        conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
    }
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
        job.setPartitionerClass(HRegionPartitioner.class);
        HTable outputTable = new HTable(conf, table);
        int regions = outputTable.getRegionsInfo().size();
        if (job.getNumReduceTasks() > regions) {
            job.setNumReduceTasks(outputTable.getRegionsInfo().size());
        }
    } else if (partitioner != null) {
        job.setPartitionerClass(partitioner);
    }
    if (addDependencyJars) {
        addDependencyJars(job);
    }
    initCredentials(job);
}
From source file: gr.ntua.h2rdf.loadTriples.TotalOrderPartitioner.java
License: Apache License

/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link org.apache.hadoop.io.BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link org.apache.hadoop.io.RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain {@link Job#getNumReduceTasks()} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void setConf(Configuration conf) {
    try {
        this.conf = conf;
        String parts = getPartitionFile(conf);
        final Path partFile = new Path(parts);
        final FileSystem fs = (DEFAULT_PATH.equals(parts)) ? FileSystem.getLocal(conf) // assume in DistributedCache
                : partFile.getFileSystem(conf);
        Job job = new Job(conf);
        Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
        K[] splitPoints = readPartitions(fs, partFile, keyClass, conf);
        if (splitPoints.length > job.getNumReduceTasks() - 1) {
            System.out.println(job.getNumReduceTasks());
            System.out.println(splitPoints.length);
            System.out.println("Wrong number of partitions in keyset:");
            throw new IOException("Wrong number of partitions in keyset:" + splitPoints.length);
        }
        RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
        for (int i = 0; i < splitPoints.length - 1; ++i) {
            if (comparator.compare(splitPoints[i], splitPoints[i + 1]) >= 0) {
                throw new IOException("Split points are out of order");
            }
        }
        boolean natOrder = conf.getBoolean(NATURAL_ORDER, true);
        if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
            partitions = buildTrie((BinaryComparable[]) splitPoints, 0, splitPoints.length, new byte[0],
                    // Now that blocks of identical splitless trie nodes are
                    // represented reentrantly, and we develop a leaf for any trie
                    // node with only one split point, the only reason for a depth
                    // limit is to refute stack overflow or bloat in the pathological
                    // case where the split points are long and mostly look like bytes
                    // iii...iixii...iii . Therefore, we make the default depth
                    // limit large but not huge.
                    conf.getInt(MAX_TRIE_DEPTH, 200));
        } else {
            partitions = new BinarySearchNode(splitPoints, comparator);
        }
    } catch (IOException e) {
        throw new IllegalArgumentException("Can't read partitions file", e);
    }
}
From source file: hu.sztaki.ilab.bigdata.common.tools.InputSampler.java
License: Apache License

/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = job.getConfiguration();
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks();
    K[] samples = sampler.getSample(inf, job);
    LOG.info("Using " + samples.length + " samples");
    RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
    Arrays.sort(samples, comparator);
    Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
    FileSystem fs = dst.getFileSystem(conf);
    if (fs.exists(dst)) {
        fs.delete(dst, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(),
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    for (int i = 1; i < numPartitions; ++i) {
        int k = Math.round(stepSize * i);
        while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
            ++k;
        }
        writer.append(samples[k], nullValue);
        last = k;
    }
    writer.close();
}
From source file: hu.sztaki.ilab.bigdata.common.tools.InputSampler.java
License: Apache License

/**
 * Driver for InputSampler from the command line.
 * Configures a JobConf instance and calls {@link #writePartitionFile}.
 */
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    ArrayList<String> otherArgs = new ArrayList<String>();
    Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else if ("-inFormat".equals(args[i])) {
                job.setInputFormatClass(Class.forName(args[++i]).asSubclass(InputFormat.class));
            } else if ("-keyClass".equals(args[i])) {
                job.setMapOutputKeyClass(Class.forName(args[++i]).asSubclass(WritableComparable.class));
            } else if ("-splitSample".equals(args[i])) {
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new SplitSampler<K, V>(numSamples, maxSplits);
            } else if ("-splitRandom".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else if ("-splitInterval".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new IntervalSampler<K, V>(pcnt, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    if (job.getNumReduceTasks() <= 1) {
        System.err.println("Sampler requires more than one reducer");
        return printUsage();
    }
    if (otherArgs.size() < 2) {
        System.out.println("ERROR: Wrong number of parameters: ");
        return printUsage();
    }
    if (null == sampler) {
        sampler = new RandomSampler<K, V>(0.1, 10000, 10);
    }
    Path outf = new Path(otherArgs.remove(otherArgs.size() - 1));
    TotalOrderPartitioner.setPartitionFile(getConf(), outf);
    for (String s : otherArgs) {
        FileInputFormat.addInputPath(job, new Path(s));
    }
    InputSampler.<K, V>writePartitionFile(job, sampler);
    return 0;
}
From source file: org.apache.avro.mapreduce.AvroMultipleOutputs.java
License: Apache License

private void setSchema(Job job, Schema keySchema, Schema valSchema) {
    boolean isMaponly = job.getNumReduceTasks() == 0;
    if (keySchema != null) {
        if (isMaponly)
            AvroJob.setMapOutputKeySchema(job, keySchema);
        else
            AvroJob.setOutputKeySchema(job, keySchema);
    }
    if (valSchema != null) {
        if (isMaponly)
            AvroJob.setMapOutputValueSchema(job, valSchema);
        else
            AvroJob.setOutputValueSchema(job, valSchema);
    }
}
From source file: org.apache.blur.mapreduce.lib.CsvBlurDriverTest.java
License: Apache License

@Test
public void multiplierParamShouldIncreaseReduceTasks() throws Exception {
    Configuration configurationSetup = new Configuration();
    ControllerPool controllerPool = new CsvBlurDriver.ControllerPool() {
        @Override
        public Iface getClient(String controllerConnectionStr) {
            return getMockIface();
        }
    };
    int multiplierParam = 10;
    AtomicReference<Callable<Void>> ref = new AtomicReference<Callable<Void>>();
    Job job = CsvBlurDriver.setupJob(configurationSetup, controllerPool, ref, "-c", "host:40010", "-d",
            "family1", "col1", "col2", "-d", "family2", "col3", "col4", "-t", "table1", "-i",
            _path1.toString(), "-i", _path2.toString(), "-S", "-C", "1000000", "2000000", "-p", "SNAPPY",
            "-r", Integer.toString(multiplierParam));
    assertNotNull(job);
    assertEquals(multiplierParam * shardCount, job.getNumReduceTasks());
}
From source file: org.apache.crunch.lib.sort.TotalOrderPartitioner.java
License: Apache License

@Override
public void setConf(Configuration conf) {
    try {
        this.conf = conf;
        String parts = getPartitionFile(conf);
        final Path partFile = new Path(parts);
        final FileSystem fs = (DEFAULT_PATH.equals(parts)) ? FileSystem.getLocal(conf) // assume in DistributedCache
                : partFile.getFileSystem(conf);
        Job job = new Job(conf);
        Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
        RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
        K[] splitPoints = readPartitions(fs, partFile, keyClass, conf, comparator);
        int numReduceTasks = job.getNumReduceTasks();
        if (splitPoints.length != numReduceTasks - 1) {
            throw new IOException("Wrong number of partitions in keyset");
        }
        partitions = new BinarySearchNode(splitPoints, comparator);
    } catch (IOException e) {
        throw new IllegalArgumentException("Can't read partitions file", e);
    }
}
From source file: org.apache.jena.tdbloader4.partitioners.InputSampler.java
License: Apache License

/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
    log.debug("writePartitionFile({},{})", job, sampler);
    Configuration conf = job.getConfiguration();
    @SuppressWarnings("rawtypes")
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks() / 9;
    log.debug("Number of partitions is {} for each index", numPartitions);
    K[] samples = sampler.getSample(inf, job);
    log.info("Using " + samples.length + " samples");
    writePartitionFile(samples, "GSPO", job, conf, numPartitions);
    writePartitionFile(samples, "GPOS", job, conf, numPartitions);
    writePartitionFile(samples, "GOSP", job, conf, numPartitions);
    writePartitionFile(samples, "SPOG", job, conf, numPartitions);
    writePartitionFile(samples, "POSG", job, conf, numPartitions);
    writePartitionFile(samples, "OSPG", job, conf, numPartitions);
    writePartitionFile(samples, "SPO", job, conf, numPartitions);
    writePartitionFile(samples, "POS", job, conf, numPartitions);
    writePartitionFile(samples, "OSP", job, conf, numPartitions);
}