List of usage examples for org.apache.hadoop.mapreduce.Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
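A combiner is simply a Reducer that the framework may apply to map output before the shuffle. Because it can be applied zero, one, or many times, its input and output key/value types must both match the map output types, and setCombinerClass must be called while the job is still being defined; after submission the call fails with IllegalStateException. Before the per-project examples below, here is a minimal word-count sketch of typical usage; the driver class name, TokenizerMapper, and the argument handling are illustrative, while IntSumReducer is Hadoop's org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class WordCountWithCombiner {

    // Emits (word, 1) for every token in the input line.
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, ONE);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count with combiner");
        job.setJarByClass(WordCountWithCombiner.class);
        job.setMapperClass(TokenizerMapper.class);

        // The combiner runs on map output before the shuffle; since it may be
        // applied any number of times, its input and output types must both
        // match the map output types (Text, IntWritable here).
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // All configuration, including setCombinerClass, must happen before
        // submission; afterwards the setter throws IllegalStateException.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}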
From source file:fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.PairedEndFastqToTfq.java
License:LGPL
/**
 * Create the job to convert FASTQ files into a TFQ file.
 * @param parentConf Hadoop configuration
 * @param fastqFile1 Path of the first FASTQ file
 * @param fastqFile2 Path of the second FASTQ file
 * @param outputFile Path of the output TFQ file
 * @param reducerTaskCount the reducer task count
 * @return a Hadoop Job
 * @throws IOException if an error occurs while creating the Job
 */
public static Job convert(final Configuration parentConf, final Path fastqFile1, final Path fastqFile2,
        final Path outputFile, final int reducerTaskCount) throws IOException {

    checkNotNull(parentConf, "parentConf argument cannot be null");
    checkNotNull(fastqFile1, "fastqFile1 argument cannot be null");
    checkNotNull(fastqFile2, "fastqFile2 argument cannot be null");
    checkNotNull(outputFile, "outputFile argument cannot be null");

    final Configuration jobConf = new Configuration(parentConf);

    // Create the job and set its name
    final Job job = Job.getInstance(jobConf, "Convert FASTQ paired files in TFQ (" + fastqFile1.getName()
            + ", " + fastqFile2.getName() + ", " + outputFile.getName() + ")");

    // Set the jar
    job.setJarByClass(PairedEndFastqToTfq.class);

    // Set input paths
    FileInputFormat.addInputPath(job, fastqFile1);
    FileInputFormat.addInputPath(job, fastqFile2);

    // Set the input format
    job.setInputFormatClass(FastqInputFormat.class);

    // Set the Reducer class
    job.setReducerClass(FastqPairedEndReducer.class);

    // Set the Combiner class
    job.setCombinerClass(FastqPairedEndReducer.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(Text.class);

    // Set the reducer task count
    if (reducerTaskCount > 0) {
        job.setNumReduceTasks(reducerTaskCount);
    }

    // Set output path
    FileOutputFormat.setOutputPath(job, outputFile);

    return job;
}
From source file:functionaltests.ext.mapreduce.TestMapReduce.java
License:Apache License
private Job prepareHadoopJob(boolean combiner) throws Throwable {
    helper.cleanup();

    // generate input
    helper.writeFile("in/part1", INPUT1);
    helper.writeFile("in/part2", INPUT2);

    // create and configure Hadoop job
    Configuration conf = new Configuration();
    Job job = new Job(conf, "word count");
    job.setMapperClass(TokenizerMapper.class);
    if (combiner) {
        job.setCombinerClass(IntSumReducer.class);
    }
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path("part1"));
    FileInputFormat.addInputPath(job, new Path("part2"));
    FileOutputFormat.setOutputPath(job, new Path("output"));
    return job;
}
From source file:gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java
License:Apache License
private void setupCombiner(final Job job) throws IOException {
    job.setCombinerClass(AccumuloKeyValueReducer.class);
}
From source file:gaffer.analytic.impl.GraphStatistics.java
License:Apache License
public int run(String[] args) throws Exception {
    // Usage
    if (args.length != 6 && args.length != 7) {
        System.err.println(USAGE);
        return 1;
    }

    // Parse options
    Path outputPath = new Path(args[0]);
    String accumuloPropertiesFile = args[1];
    int numReduceTasks;
    try {
        numReduceTasks = Integer.parseInt(args[2]);
    } catch (NumberFormatException e) {
        System.err.println(USAGE);
        return 1;
    }
    Date startDate = null;
    Date endDate = null;
    boolean useTimeWindow = false;
    if (!args[3].equals("null") && !args[4].equals("null")) {
        try {
            startDate = DATE_FORMAT.parse(args[3]);
            endDate = DATE_FORMAT.parse(args[4]);
        } catch (ParseException e) {
            System.err.println("Error parsing dates: " + args[3] + " " + args[4] + " " + e.getMessage());
            return 1;
        }
        useTimeWindow = true;
    }
    boolean rollUpOverTimeAndVisibility = Boolean.parseBoolean(args[5]);
    boolean seedsSpecified = (args.length == 7);
    String seedsFile = "";
    if (seedsSpecified) {
        seedsFile = args[6];
    }

    // Hadoop configuration
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Connect to Accumulo, so we can check the connection and check that the table exists
    AccumuloConfig accConf = new AccumuloConfig(accumuloPropertiesFile);
    Connector conn = Accumulo.connect(accConf);
    String tableName = accConf.getTable();
    Authorizations authorizations = conn.securityOperations().getUserAuthorizations(accConf.getUserName());

    // Check if the table exists
    if (!conn.tableOperations().exists(tableName)) {
        System.err.println("Table " + tableName + " does not exist.");
        return 1;
    }

    // Create graph and update configuration based on the view
    AccumuloBackedGraph graph = new AccumuloBackedGraph(conn, tableName);
    if (useTimeWindow) {
        graph.setTimeWindow(startDate, endDate);
    }
    graph.rollUpOverTimeAndVisibility(rollUpOverTimeAndVisibility);
    if (seedsSpecified) {
        Set<TypeValue> typeValues = new HashSet<TypeValue>();
        BufferedReader reader = new BufferedReader(new FileReader(seedsFile));
        String line;
        while ((line = reader.readLine()) != null) {
            String[] tokens = line.split("\\|");
            if (tokens.length != 2) {
                System.err.println("Invalid line: " + line);
                continue;
            }
            String type = tokens[0];
            String value = tokens[1];
            typeValues.add(new TypeValue(type, value));
        }
        reader.close();
        graph.setConfiguration(conf, typeValues, accConf);
    } else {
        graph.setConfiguration(conf, accConf);
    }

    // Conf
    conf.setBoolean("mapred.compress.map.output", true);
    conf.setClass("mapred.map.output.compression.codec", SnappyCodec.class, CompressionCodec.class);

    // Job
    Job job = new Job(conf);
    job.setJarByClass(getClass());
    job.setJobName("Running MapReduce against Gaffer data in Accumulo: input = " + tableName
            + ", output = " + outputPath);

    // Input format - use BatchScannerElementInputFormat if seeds have been specified (as that creates fewer
    // splits); otherwise use ElementInputFormat which is based on the standard AccumuloInputFormat.
    if (seedsSpecified) {
        job.setInputFormatClass(BatchScannerElementInputFormat.class);
    } else {
        job.setInputFormatClass(ElementInputFormat.class);
    }

    // Mapper
    job.setMapperClass(GraphStatisticsMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(SetOfStatistics.class);

    // Combiner
    job.setCombinerClass(GraphStatisticsReducer.class);

    // Reducer
    job.setReducerClass(GraphStatisticsReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SetOfStatistics.class);
    job.setNumReduceTasks(numReduceTasks);

    // Output
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);

    System.out.println("Running MapReduce job over:");
    System.out.println("\tTable: " + accConf.getTable());
    System.out.println("\tUser: " + accConf.getUserName());
    System.out.println("\tAuths: " + authorizations);
    if (useTimeWindow) {
        System.out.println("\tFilter by time: start time is " + DATE_FORMAT.format(startDate) + ", "
                + DATE_FORMAT.format(endDate));
    } else {
        System.out.println("\tFilter by time is off");
    }
    System.out.println("\tRoll up over time and visibility: " + rollUpOverTimeAndVisibility);

    // Run job
    job.waitForCompletion(true);

    // Successful?
    if (!job.isSuccessful()) {
        System.err.println("Error running job");
        return 1;
    }

    // Write results out
    System.out.println("Summary of graph");
    for (FileStatus file : fs.listStatus(outputPath)) {
        if (!file.isDirectory() && !file.getPath().getName().contains("_SUCCESS")) {
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
            Text text = new Text();
            SetOfStatistics stats = new SetOfStatistics();
            while (reader.next(text, stats)) {
                System.out.println(text + ", " + stats);
            }
            reader.close();
        }
    }

    return 0;
}
From source file:gov.llnl.ontology.mapreduce.stats.CompoundTokenCountMR.java
License:Open Source License
/**
 * Sets up the Reducer for this job.
 */
protected void setupReducer(String tableName, Job job, MRArgOptions options) {
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(options.getPositionalArg(0)));
    job.setNumReduceTasks(24);
}
From source file:gov.llnl.ontology.mapreduce.stats.DependencyOccurrenceCountMR.java
License:Open Source License
/**
 * Sets up the Reducer for this job.
 */
protected void setupReducer(String tableName, Job job, MRArgOptions options) {
    job.setCombinerClass(WordCountSumReducer.class);
    job.setReducerClass(WordCountSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(options.getPositionalArg(0)));
    job.setNumReduceTasks(24);
}
From source file:gov.llnl.ontology.mapreduce.stats.TagNetworkMR.java
License:Open Source License
/**
 * Sets up the Reducer for this job.
 */
protected void setupReducer(String tableName, Job job, MRArgOptions options) {
    job.setCombinerClass(WordSumReducer.class);
    job.setReducerClass(WordSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(options.getPositionalArg(0)));
    job.setNumReduceTasks(2);
}
From source file:gov.llnl.ontology.mapreduce.stats.WordnetShortestPathMR.java
License:Open Source License
/**
 * {@inheritDoc}
 */
public int run(String[] args) throws Exception {
    // Setup and validate the arguments.
    ArgOptions options = new ArgOptions();
    options.addOption('w', "wordnetDir", "The directory path to the wordnet data files",
            true, "PATH", "Required");
    options.parseOptions(args);
    if (!options.hasOption('w')) {
        System.err.println("usage: java WordnetShortestPathMR [OPTIONS] <outdir>\n" + options.prettyPrint());
    }

    // Open the wordnet reader and gather the set of all Synsets known by
    // the ontology.
    OntologyReader reader = WordNetCorpusReader.initialize(options.getStringOption('w'));
    Set<Synset> synsetSet = new HashSet<Synset>();
    for (String lemma : reader.wordnetTerms())
        for (Synset synset : reader.getSynsets(lemma))
            synsetSet.add(synset);

    // Compute each pairing of Synsets and write that pairing to a file in HDFS.
    Synset[] synsets = synsetSet.toArray(new Synset[0]);
    PrintStream outStream = createPrintStream();
    for (int i = 0; i < synsets.length; ++i)
        for (int j = i + 1; j < synsets.length; ++j)
            outStream.printf("%s|%s\n", synsets[i].getName(), synsets[j].getName());
    outStream.close();

    // Store the wordnet directory information so that the mappers can load
    // it up.  They need it to figure out the shortest path information.
    Configuration conf = getConf();
    conf.set(WORDNET, options.getStringOption('w'));

    // Setup the job information.
    Job job = new Job(conf, "Compute Wordnet Shortest Paths");
    job.setJarByClass(WordnetShortestPathMR.class);
    job.setMapperClass(WordnetShortestPathMapper.class);

    // The input file will be the temporary file created with the synset pairings.
    job.setInputFormatClass(LineDocInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(TEMP_TERM_PAIR_PATH));

    // The mappers do all of the real work, so we just write their output straight to disk.
    job.setCombinerClass(Reducer.class);
    job.setReducerClass(Reducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(options.getPositionalArg(0)));

    // Start the job.
    job.waitForCompletion(true);

    return 0;
}
From source file:gr.ntua.ece.cslab.modissense.queries.clients.GeneralHotIntQueryClient.java
@Override
public void executeQuery() {
    try {
        if (this.createIfNotExist()) { // table exists
            Configuration conf = HBaseConfiguration.create();
            Job job = new Job(conf, "Non personalized hotness interest");
            job.setJarByClass(GeneralHotIntQueryClient.class);

            Scan scan = new Scan();
            scan.setCaching(10000);
            scan.setFilter(new ColumnRangeFilter(Bytes.toBytes(startTimestamp), true,
                    Bytes.toBytes(endTimestamp), true));

            TableMapReduceUtil.initTableMapperJob(this.srcTable, // table name in bytes
                    scan, // scanner to use
                    GeneralHotIntQueryMapper.class, // mapper class
                    LongWritable.class, // key class
                    HotnessInterestWritable.class, // value class
                    job); // job object
            TableMapReduceUtil.initTableReducerJob(this.targetTable, GeneralHotIntQueryReducer.class, job);

            job.setPartitionerClass(HashPartitioner.class);
            job.setCombinerClass(GeneralHotIntQueryCombiner.class);
            job.setNumReduceTasks(4);
            job.setOutputFormatClass(TableOutputFormat.class);

            job.waitForCompletion(true);
        }
        this.openConnection(targetTable);
    } catch (IOException | InterruptedException | ClassNotFoundException ex) {
        Logger.getLogger(GeneralHotIntQueryClient.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:gr.ntua.h2rdf.byteImport.HexastoreBulkImport.java
License:Open Source License
public Job createSubmittableJob(String[] args) {
    TABLE_NAME = args[1];
    Job job = null;
    try {
        Configuration conf = new Configuration();
        conf.addResource("hbase-default.xml");
        conf.addResource("hbase-site.xml");
        job = new Job(conf, NAME);
        job.setJarByClass(HexastoreBulkImport.class);
        job.setMapperClass(TotalOrderPrep.Map.class);
        job.setReducerClass(Reduce.class); //sampler.HamaReducer.class);
        job.setCombinerClass(Combiner.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(ImmutableBytesWritable.class);
        job.setPartitionerClass(TotalOrderPartitioner.class);
        //TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("/user/npapa/"+regions+"partitions/part-r-00000"));
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("partitions/part-r-00000"));
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(HFileOutputFormat.class);
        Path out = new Path("out");
        FileOutputFormat.setOutputPath(job, out);
        FileSystem fs;
        try {
            fs = FileSystem.get(conf);
            if (fs.exists(out)) {
                fs.delete(out, true);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        // c.addResource(new Path("/0/arcomemDB/hadoop-0.20.2-cdh3u3/conf/hbase-site.xml"));
        HBaseAdmin hadmin = new HBaseAdmin(conf);
        HTableDescriptor desc = new HTableDescriptor(TABLE_NAME + "_stats");
        HColumnDescriptor family = new HColumnDescriptor("size");
        desc.addFamily(family);
        conf.setInt("zookeeper.session.timeout", 600000);
        if (hadmin.tableExists(TABLE_NAME + "_stats")) {
            //hadmin.disableTable(TABLE_NAME+"_stats");
            //hadmin.deleteTable(TABLE_NAME+"_stats");
        } else {
            hadmin.createTable(desc);
        }

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        //job.getConfiguration().setInt("mapred.map.tasks", 18);
        job.getConfiguration().set("h2rdf.tableName", TABLE_NAME);
        job.getConfiguration().setInt("mapred.reduce.tasks", (int) TotalOrderPrep.regions);
        job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
        job.getConfiguration().setInt("io.sort.mb", 100);
        job.getConfiguration().setInt("io.file.buffer.size", 131072);
        job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);
        //job.getConfiguration().setInt("hbase.hregion.max.filesize", 67108864);
        job.getConfiguration().setInt("hbase.hregion.max.filesize", 33554432);
    } catch (IOException e2) {
        e2.printStackTrace();
    }
    return job;
}