Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.Job.setPartitionerClass from open-source projects.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
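
The snippet below is a minimal, illustrative sketch of writing a custom Partitioner and registering it with setPartitionerClass; the class and field names are invented for illustration and do not come from the examples on this page. The call throws IllegalStateException if the job has already been submitted.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class WordLengthPartitioner extends Partitioner<Text, IntWritable> {
    // Route map output keys to reducers by word length, so equal-length
    // words always land in the same partition.
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        return (key.getLength() & Integer.MAX_VALUE) % numPartitions;
    }
}

// Registering the partitioner on a job:
// Job job = Job.getInstance(conf, "word count");
// job.setMapOutputKeyClass(Text.class);
// job.setMapOutputValueClass(IntWritable.class);
// job.setPartitionerClass(WordLengthPartitioner.class);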

Usage

From source file:SecondarySort.HashToMinWithSS.java

protected Job jobConfig() throws IOException {
    JobConf conf = new JobConf();
    Job job = new Job(conf, "iteration");
    job.setJarByClass(HashToMinWithSS.class);
    job.setReducerClass(ReduceSS.class);
    job.setPartitionerClass(LongPair.HPartitioner.class);
    job.setSortComparatorClass(LongPair.Comparator.class);
    job.setGroupingComparatorClass(LongPair.GroupComparator.class);
    job.setOutputKeyClass(LongPair.class);
    job.setOutputValueClass(Text.class);
    return job;
}
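
LongPair.HPartitioner itself is not reproduced on this page. In a typical secondary-sort setup the partitioner hashes only the natural (first) component of the composite key, so every record for that key reaches the same reducer while the sort comparator orders records by the secondary component. A hedged sketch, assuming LongPair exposes a getFirst() accessor:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner for a secondary sort over a (first, second) long pair:
// only the first component decides the partition, keeping grouping intact.
public class LongPairNaturalKeyPartitioner extends Partitioner<LongPair, Text> {
    @Override
    public int getPartition(LongPair key, Text value, int numPartitions) {
        return (Long.hashCode(key.getFirst()) & Integer.MAX_VALUE) % numPartitions;
    }
}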

From source file:SecondarySort.inputMaker.java

License:Open Source License

protected Job jobConfig() throws IOException {
    JobConf conf = new JobConf();
    Job job = new Job(conf, "iteration");
    job.setJarByClass(inputMaker.class);
    job.setMapperClass(MapMSS.class);
    job.setReducerClass(ReduceSS.class);
    job.setPartitionerClass(LongPair.HPartitioner.class);
    job.setSortComparatorClass(LongPair.Comparator.class);
    job.setGroupingComparatorClass(LongPair.GroupComparator.class);
    job.setOutputKeyClass(LongPair.class);
    job.setOutputValueClass(Text.class);
    return job;
}

From source file:simsql.runtime.MRLoader.java

License:Apache License

public long run(String inputPath, String outputPath, short typeCode, Relation r, int sortAtt) {

    // make a directory for the relation
    Configuration conf = new Configuration();
    FileSystem dfs = null;

    try {
        dfs = FileSystem.get(conf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot access HDFS!", e);
    }

    try {
        // if it exists, destroy it.
        Path path = new Path(outputPath);
        if (dfs.exists(path)) {
            dfs.delete(path, true);
        }
    } catch (Exception e) {
        throw new RuntimeException("Could not create the file to bulk load to!", e);
    }

    // find a file name 
    String tempPath = null;
    if (inputPath.startsWith("hdfs:")) {
        tempPath = inputPath.replace("hdfs:", "");
    } else {
        tempPath = "/tempDataFile_" + r.getName();
        try {
            dfs.delete(new Path(tempPath), true);
        } catch (Exception e) {
            // ignore this.
        }

        // upload the text file
        try {
            dfs.copyFromLocalFile(false, true, new Path(inputPath), new Path(tempPath));
            dfs.deleteOnExit(new Path(tempPath));
        } catch (Exception e) {
            throw new RuntimeException("Failed to upload text file " + inputPath + " to HDFS!", e);
        }
    }

    // set up the new job's parameters.
    conf.setBoolean("mapred.compress.map.output", true);
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());

    conf.set("io.serializations",
            "simsql.runtime.RecordSerialization,simsql.runtime.RecordKeySerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt("simsql.loader.numAtts", r.getAttributes().size());
    conf.setInt("simsql.loader.typeCode", (int) typeCode);
    conf.setInt("simsql.loader.sortAtt", sortAtt);

    String[] myStrings = new String[r.getAttributes().size()];
    int j = 0;
    for (simsql.compiler.Attribute a : r.getAttributes()) {
        myStrings[j++] = a.getPhysicalRealization().getClass().getName();
    }

    conf.setStrings("simsql.loader.types", myStrings);

    // create a job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create bulk loading job!", e);
    }

    // set the split size (number of mappers)
    long fSize = 0;
    if (inputPath.startsWith("hdfs")) {
        fSize = RelOp.getPathsTotalSize(new String[] { tempPath });
    } else {
        fSize = new File(inputPath).length();
    }

    FileInputFormat.setMinInputSplitSize(job, fSize / (long) numTasks);
    FileInputFormat.setMaxInputSplitSize(job, fSize / (long) numTasks);

    // and the number of reducers
    job.setNumReduceTasks(numTasks);

    // the mapper/reducer/jar
    job.setMapperClass(MRLoaderMapper.class);
    job.setReducerClass(MRLoaderReducer.class);
    job.setJarByClass(MRLoader.class);

    // I/O settings.
    job.setOutputFormatClass(RecordOutputFormat.class);

    job.setMapOutputKeyClass(RecordKey.class);
    job.setMapOutputValueClass(RecordWrapper.class);
    job.setOutputKeyClass(Nothing.class);
    job.setOutputValueClass(Record.class);
    try {
        FileInputFormat.setInputPaths(job, new Path(tempPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
    } catch (Exception e) {
        throw new RuntimeException("Could not set job inputs/outputs", e);
    }
    job.setGroupingComparatorClass(RecordKeyGroupingComparator.class);
    job.setPartitionerClass(RecordPartitioner.class);
    job.setSortComparatorClass(RecordKeySortComparator.class);

    job.setJobName("MRLoader: " + inputPath + " ==> " + outputPath);

    // run it
    Counters counters;
    try {
        job.waitForCompletion(true);
        counters = job.getCounters();
    } catch (Exception e) {
        throw new RuntimeException("Could not set up bulk loader job!", e);
    }

    // now, delete all the empty part files
    try {

        // get a filesystem
        FileSystem ddfs = FileSystem.get(conf);
        Path outPath = new Path(outputPath);
        if (ddfs.exists(outPath) && ddfs.isDirectory(outPath)) {
            FileStatus fstatus[] = ddfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (ddfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    ddfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) { // this isn't disastrous 
    }

    // get the counter for the output of the mapper.
    Counter bytesCounter = counters.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);
    return bytesCounter.getValue();
}

From source file:simsql.runtime.RelOp.java

License:Apache License

public boolean run(RuntimeParameter params, boolean verbose) {

    ExampleRuntimeParameter pp = (ExampleRuntimeParameter) params;

    // build the jar.
    String jarFile = buildJarFile(params);

    // Get the default configuration object
    Configuration conf = new Configuration();

    // set quite mode on/off
    conf.setQuietMode(!verbose);

    /***
    conf.setBoolean("mapred.task.profile", true);
    conf.set("mapred.task.profile.params", "-agentlib:hprof=cpu=samples," +
        "heap=sites,depth=8,force=n,thread=y,verbose=n,file=%s");
    ***/

    // tell it how to serialize and deserialize records and recordkeys
    conf.set("io.serializations", getSerializations());
    conf.setBoolean("mapred.compress.map.output", true);

    int ioSortMB = conf.getInt("io.sort.mb", 256);
    conf.set("mapred.map.child.java.opts", "-Xmx" + (getMemPerMapper(params) + ioSortMB) + "m -Xms"
            + (getMemPerMapper(params))
            + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.set("mapred.reduce.child.java.opts", "-Xmx" + (getMemPerReducer(params) + ioSortMB) + "m -Xms"
            + (getMemPerMapper(params))
            + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.setInt("simsql.input.numSplits", pp.getNumCPUs());
    conf.setInt("mapred.job.reuse.jvm.num.tasks", 1);
    // conf.setBoolean ("mapred.map.tasks.speculative.execution", false);
    // conf.setBoolean ("mapred.reduce.tasks.speculative.execution", false);

    // tell it to use the jar that we just created
    conf.set("mapred.jar", jarFile);

    // conf.set("tmpjars", "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core.jar");

    conf.setBoolean("mapred.output.compress", true);
    conf.setStrings("mapred.output.compression.type", new String[] { "RECORD" });

    // use snappy for the intermediate stuff
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());

    // do some additional operator-specific configurations
    setConfigurations(conf, params);

    // collect statistics for final relations always
    conf.setBoolean("simsql.collectStats", isFinal || collectStats);

    // figure out what file to map
    String[] inDirs = myInputNetwork.getPipelinedInputFiles();
    inDirs = excludeAnyWhoWillNotBeMapped(inDirs);
    String inSingleString = inDirs[0];
    conf.set("simsql.fileToMap", inSingleString);
    for (int i = 1; i < inDirs.length; i++) {
        inSingleString += "," + inDirs[i];
    }

    // create and name the job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create a new job!", e);
    }

    job.setJobName(getJobName());

    // set the map-reduce input and output types
    job.setMapOutputKeyClass(getMapOutputKeyClass());
    job.setMapOutputValueClass(getMapOutputValueClass());
    job.setOutputKeyClass(getOutputKeyClass());
    job.setOutputValueClass(getOutputValueClass());

    int numReducers = getNumReducers(params);

    job.setMapperClass(getMapperClass());
    job.setReducerClass(getReducerClass());

    // set the number of reducers
    job.setNumReduceTasks(numReducers);

    // set the input and the output formats... these extend FileInputFormat and FileOutputFormat
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(getOutputFormatClass());

    // set the input and output paths
    try {
        System.out.println("input file: " + inSingleString);
        FileInputFormat.setInputPaths(job, inSingleString);
        FileInputFormat.setInputPathFilter(job, TableFileFilter.class);
        FileOutputFormat.setOutputPath(job, new Path(getOutput()));
    } catch (Exception e) {
        throw new RuntimeException("Unable to set up the input/output path for the job.", e);
    }

    // set the split size
    FileInputFormat.setMinInputSplitSize(job, getSplitSize(params));
    FileInputFormat.setMaxInputSplitSize(job, getSplitSize(params));

    // set the various sorting/grouping/mapping classes
    job.setGroupingComparatorClass(getGroupingComparatorClass());
    job.setPartitionerClass(getPartitionerClass());
    job.setSortComparatorClass(getSortComparatorClass());

    // and now, submit the job and wait for things to finish
    int exitCode;
    try {
        exitCode = job.waitForCompletion(verbose) ? 0 : 1;

        // get the output bytes counter.
        Counters c = job.getCounters();
        Counter mx = c.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);

        // and use them to set the size of the output relation.
        if (myDB != null) {
            myDB.setTableSize(myDB.getTableName(getOutput()), mx.getValue());
            myDB.setNumAtts(myDB.getTableName(getOutput()), getOutputAttNames().length);
        }

    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Unable to run the job", e);
    }

    // now, delete all the empty part files
    try {

        // get a filesystem
        FileSystem dfs = FileSystem.get(conf);
        Path outPath = new Path(getOutput());
        if (dfs.exists(outPath) && dfs.isDirectory(outPath)) {
            FileStatus fstatus[] = dfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (dfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    dfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) { // this isn't disastrous 
    }
    return (exitCode == 0);
}

From source file:sixdegrees.AdjacencyList.java

License:Apache License

public void execute(String args[]) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: adjacencylist <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "Adjacency List");
    job.setJarByClass(AdjacencyList.class);
    job.setMapperClass(AdjacencyMapper.class);
    job.setPartitionerClass(CustomPartitioner.class);
    job.setReducerClass(AdjacencyReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setNumReduceTasks(6);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1] + "0"));
    job.waitForCompletion(true);
}

From source file:sourcefiles.PartitionGraph.java

License:Apache License

/**
 * Runs this tool.
 */
@Override
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(RANGE, "use range partitioner"));

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of partitions")
            .create(NUM_PARTITIONS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)
            || !cmdline.hasOption(NUM_PARTITIONS)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inPath = cmdline.getOptionValue(INPUT);
    String outPath = cmdline.getOptionValue(OUTPUT);
    int nodeCount = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    int numParts = Integer.parseInt(cmdline.getOptionValue(NUM_PARTITIONS));
    boolean useRange = cmdline.hasOption(RANGE);

    LOG.info("Tool name: " + PartitionGraph.class.getSimpleName());
    LOG.info(" - input dir: " + inPath);
    LOG.info(" - output dir: " + outPath);
    LOG.info(" - num partitions: " + numParts);
    LOG.info(" - node cnt: " + nodeCount);
    LOG.info(" - use range partitioner: " + useRange);

    Configuration conf = getConf();
    conf.setInt("NodeCount", nodeCount);

    Job job = Job.getInstance(conf);
    job.setJobName(PartitionGraph.class.getSimpleName() + ":" + inPath);
    job.setJarByClass(PartitionGraph.class);

    job.setNumReduceTasks(numParts);

    FileInputFormat.setInputPaths(job, new Path(inPath));
    FileOutputFormat.setOutputPath(job, new Path(outPath));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PageRankNodeEnhanced.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PageRankNodeEnhanced.class);

    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    FileSystem.get(conf).delete(new Path(outPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file:terasort.TeraSort.java

License:Apache License

public int run(String[] args) throws Exception {
    LOG.info("starting");
    Job job = Job.getInstance(getConf());
    Path inputDir = new Path(args[0]);

    Path outputDir = new Path(args[1]);
    FileSystem fs = FileSystem.get(getConf());
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }
    boolean useSimplePartitioner = getUseSimplePartitioner(job);
    TeraInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TeraInputFormat.class);
    job.setOutputFormatClass(TeraOutputFormat.class);
    if (useSimplePartitioner) {
        job.setPartitionerClass(SimplePartitioner.class);
    } else {
        long start = System.currentTimeMillis();
        Path partitionFile = new Path(outputDir, TeraInputFormat.PARTITION_FILENAME);
        URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
        try {
            TeraInputFormat.writePartitionFile(job, partitionFile);
        } catch (Throwable e) {
            LOG.error(e.getMessage());
            return -1;
        }
        job.addCacheFile(partitionUri);
        long end = System.currentTimeMillis();
        System.out.println("Spent " + (end - start) + "ms computing partitions.");
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }

    job.getConfiguration().setInt("dfs.replication", getOutputReplication(job));
    TeraOutputFormat.setFinalSync(job, true);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    LOG.info("done");
    return ret;
}
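
If the TotalOrderPartitioner referenced above is the stock org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner (an assumption; the import is not shown in the excerpt), the partition file can also be registered explicitly instead of relying on the default cache-file name. A minimal sketch:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSetup {
    // Point the partitioner at a precomputed partition file holding
    // (numReduceTasks - 1) split keys, then enable total-order partitioning.
    public static void configure(Job job, Path partitionFile) {
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }
}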

From source file:uk.ac.cam.eng.extraction.hadoop.features.phrase.Source2TargetJob.java

License:Apache License

public static Job getJob(Configuration conf) throws IOException {
    conf.set("mapred.map.child.java.opts", "-Xmx200m");
    conf.set("mapred.reduce.child.java.opts", "-Xmx5128m");
    conf.setBoolean(MarginalReducer.SOURCE_TO_TARGET, true);
    Job job = new Job(conf);
    job.setJarByClass(Source2TargetJob.class);
    job.setJobName("Source2Taget");
    job.setSortComparatorClass(Source2TargetComparator.class);
    job.setPartitionerClass(Source2TargetPartitioner.class);
    job.setMapperClass(KeepProvenanceCountsOnlyMapper.class);
    job.setReducerClass(MarginalReducer.class);
    job.setMapOutputKeyClass(RuleWritable.class);
    job.setMapOutputValueClass(ProvenanceCountMap.class);
    job.setOutputKeyClass(RuleWritable.class);
    job.setOutputValueClass(FeatureMap.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    return job;
}

From source file:uk.ac.cam.eng.extraction.hadoop.features.phrase.Target2SourceJob.java

License:Apache License

public static Job getJob(Configuration conf) throws IOException {
    conf.set("mapred.map.child.java.opts", "-Xmx200m");
    conf.set("mapred.reduce.child.java.opts", "-Xmx5128m");
    conf.setBoolean(MarginalReducer.SOURCE_TO_TARGET, false);
    Job job = new Job(conf);
    job.setJarByClass(Target2SourceJob.class);
    job.setJobName("Target2Source");
    job.setSortComparatorClass(Target2SourceComparator.class);
    job.setPartitionerClass(Target2SourcePartitioner.class);
    job.setMapperClass(SwappingMapper.class);
    job.setReducerClass(MarginalReducer.class);
    job.setMapOutputKeyClass(RuleWritable.class);
    job.setMapOutputValueClass(ProvenanceCountMap.class);
    job.setOutputKeyClass(RuleWritable.class);
    job.setOutputValueClass(FeatureMap.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    return job;
}

From source file:uk.ac.cam.eng.extraction.hadoop.merge.MergeJob.java

License:Apache License

public static Job getJob(Configuration conf) throws IOException {

    conf.set("mapred.map.child.java.opts", "-Xmx200m");
    conf.set("mapred.reduce.child.java.opts", "-Xmx10240m");

    Job job = new Job(conf);
    job.setJarByClass(MergeJob.class);
    job.setJobName("Merge");
    job.setSortComparatorClass(MergeComparator.class);
    job.setPartitionerClass(MergePartitioner.class);
    job.setReducerClass(MergeReducer.class);
    job.setCombinerClass(MergeCombiner.class);
    job.setMapOutputKeyClass(RuleWritable.class);
    job.setMapOutputValueClass(AlignmentAndFeatureMap.class);
    job.setOutputKeyClass(RuleWritable.class);
    job.setOutputValueClass(AlignmentAndFeatureMap.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SimpleHFileOutputFormat.class);
    return job;
}