Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

This page lists example usages of org.apache.hadoop.mapreduce Job setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job.
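Before the extracted examples, here is a minimal, self-contained sketch showing where setCombinerClass fits in a typical job definition. The class name WordCountWithCombiner, the mapper and reducer below, and the argument-based input/output paths are assumptions made for illustration, not taken from any of the sources that follow. The key constraint is that the combiner must be a Reducer whose input and output key/value types both match the map output types, which is why a summing reducer can usually be reused as its own combiner.

// Minimal sketch (assumed class names and paths): a word-count job that reuses
// its Reducer as a combiner to pre-aggregate counts on the map side.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountWithCombiner {

    public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Emit (token, 1) for every whitespace-separated token in the line
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) {
                    word.set(token);
                    context.write(word, ONE);
                }
            }
        }
    }

    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the partial counts for each token
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count with combiner");
        job.setJarByClass(WordCountWithCombiner.class);
        job.setMapperClass(TokenMapper.class);
        // The combiner runs on map output before the shuffle; SumReducer's input
        // and output types both match the map output types, so it can be reused here.
        job.setCombinerClass(SumReducer.class);
        job.setReducerClass(SumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that the combiner is an optimization only: the framework may run it zero, one, or several times per map task, so the operation it applies must be commutative and associative, as it is for summation. The examples below show the same pattern in larger, real-world jobs.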

Usage

From source file:fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.PairedEndFastqToTfq.java

License:LGPL

/**
 * Create the job to convert FASTQ files into a TFQ file.
 * @param parentConf Hadoop configuration
 * @param fastqFile1 Path of the first FASTQ file
 * @param fastqFile2 Path of the second FASTQ file
 * @param outputFile Path of the output TFQ file
 * @param reducerTaskCount the reducer task count
 * @return a Hadoop Job
 * @throws IOException if an error occurs while creating the Job
 */
public static Job convert(final Configuration parentConf, final Path fastqFile1, final Path fastqFile2,
        final Path outputFile, final int reducerTaskCount) throws IOException {

    checkNotNull(parentConf, "parentConf argument cannot be null");
    checkNotNull(fastqFile1, "fastqFile1 argument cannot be null");
    checkNotNull(fastqFile2, "fastqFile2 argument cannot be null");
    checkNotNull(outputFile, "outputFile argument cannot be null");

    final Configuration jobConf = new Configuration(parentConf);

    // Create the job and set its name
    final Job job = Job.getInstance(jobConf, "Convert FASTQ paired files in TFQ (" + fastqFile1.getName() + ", "
            + fastqFile2.getName() + ", " + outputFile.getName() + ")");

    // Set the jar
    job.setJarByClass(PairedEndFastqToTfq.class);

    // Set input path
    FileInputFormat.addInputPath(job, fastqFile1);
    FileInputFormat.addInputPath(job, fastqFile2);

    // Set the input format
    job.setInputFormatClass(FastqInputFormat.class);

    // Set the Reducer class
    job.setReducerClass(FastqPairedEndReducer.class);

    // Set the Combiner class
    job.setCombinerClass(FastqPairedEndReducer.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(Text.class);

    // Set the reducer task count
    if (reducerTaskCount > 0) {
        job.setNumReduceTasks(reducerTaskCount);
    }

    // Set output path
    FileOutputFormat.setOutputPath(job, outputFile);

    return job;
}

From source file:functionaltests.ext.mapreduce.TestMapReduce.java

License:Apache License

private Job prepareHadoopJob(boolean combiner) throws Throwable {

    helper.cleanup();

    // generate input
    helper.writeFile("in/part1", INPUT1);
    helper.writeFile("in/part2", INPUT2);

    // create and configure Hadoop job
    Configuration conf = new Configuration();
    Job job = new Job(conf, "word count");
    job.setMapperClass(TokenizerMapper.class);
    if (combiner) {
        job.setCombinerClass(IntSumReducer.class);
    }
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path("part1"));
    FileInputFormat.addInputPath(job, new Path("part2"));
    FileOutputFormat.setOutputPath(job, new Path("output"));

    return job;
}

From source file:gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java

License:Apache License

private void setupCombiner(final Job job) throws IOException {
    job.setCombinerClass(AccumuloKeyValueReducer.class);
}

From source file:gaffer.analytic.impl.GraphStatistics.java

License:Apache License

public int run(String[] args) throws Exception {
    // Usage
    if (args.length != 6 && args.length != 7) {
        System.err.println(USAGE);
        return 1;
    }

    // Parse options
    Path outputPath = new Path(args[0]);
    String accumuloPropertiesFile = args[1];
    int numReduceTasks;
    try {
        numReduceTasks = Integer.parseInt(args[2]);
    } catch (NumberFormatException e) {
        System.err.println(USAGE);
        return 1;
    }
    Date startDate = null;
    Date endDate = null;
    boolean useTimeWindow = false;
    if (!args[3].equals("null") && !args[4].equals("null")) {
        try {
            startDate = DATE_FORMAT.parse(args[3]);
            endDate = DATE_FORMAT.parse(args[4]);
        } catch (ParseException e) {
            System.err.println("Error parsing dates: " + args[3] + " " + args[4] + " " + e.getMessage());
            return 1;
        }
        useTimeWindow = true;
    }
    boolean rollUpOverTimeAndVisibility = Boolean.parseBoolean(args[5]);
    boolean seedsSpecified = (args.length == 7);
    String seedsFile = "";
    if (seedsSpecified) {
        seedsFile = args[6];
    }

    // Hadoop configuration
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Connect to Accumulo, so we can check connection and check that the
    // table exists
    AccumuloConfig accConf = new AccumuloConfig(accumuloPropertiesFile);
    Connector conn = Accumulo.connect(accConf);
    String tableName = accConf.getTable();
    Authorizations authorizations = conn.securityOperations().getUserAuthorizations(accConf.getUserName());

    // Check if the table exists
    if (!conn.tableOperations().exists(tableName)) {
        System.err.println("Table " + tableName + " does not exist.");
        return 1;
    }

    // Create graph and update configuration based on the view
    AccumuloBackedGraph graph = new AccumuloBackedGraph(conn, tableName);
    if (useTimeWindow) {
        graph.setTimeWindow(startDate, endDate);
    }
    graph.rollUpOverTimeAndVisibility(rollUpOverTimeAndVisibility);
    if (seedsSpecified) {
        Set<TypeValue> typeValues = new HashSet<TypeValue>();
        BufferedReader reader = new BufferedReader(new FileReader(seedsFile));
        String line;
        while ((line = reader.readLine()) != null) {
            String[] tokens = line.split("\\|");
            if (tokens.length != 2) {
                System.err.println("Invalid line: " + line);
                continue;
            }
            String type = tokens[0];
            String value = tokens[1];
            typeValues.add(new TypeValue(type, value));
        }
        reader.close();
        graph.setConfiguration(conf, typeValues, accConf);
    } else {
        graph.setConfiguration(conf, accConf);
    }

    // Conf
    conf.setBoolean("mapred.compress.map.output", true);
    conf.setClass("mapred.map.output.compression.codec", SnappyCodec.class, CompressionCodec.class);

    // Job
    Job job = new Job(conf);
    job.setJarByClass(getClass());
    job.setJobName("Running MapReduce against Gaffer data in Accumulo: input = " + tableName + ", output = "
            + outputPath);

    // Input format - use BatchScannerElementInputFormat if seeds have been specified (as that creates fewer
    // splits); otherwise use ElementInputFormat which is based on the standard AccumuloInputFormat.
    if (seedsSpecified) {
        job.setInputFormatClass(BatchScannerElementInputFormat.class);
    } else {
        job.setInputFormatClass(ElementInputFormat.class);
    }

    // Mapper
    job.setMapperClass(GraphStatisticsMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(SetOfStatistics.class);

    // Combiner
    job.setCombinerClass(GraphStatisticsReducer.class);

    // Reducer
    job.setReducerClass(GraphStatisticsReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SetOfStatistics.class);
    job.setNumReduceTasks(numReduceTasks);

    // Output
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);

    System.out.println("Running MapReduce job over:");
    System.out.println("\tTable: " + accConf.getTable());
    System.out.println("\tUser: " + accConf.getUserName());
    System.out.println("\tAuths: " + authorizations);
    if (useTimeWindow) {
        System.out.println("\tFilter by time: start time is " + DATE_FORMAT.format(startDate) + ", "
                + DATE_FORMAT.format(endDate));
    } else {
        System.out.println("\tFilter by time is off");
    }
    System.out.println("\tRoll up over time and visibility: " + rollUpOverTimeAndVisibility);

    // Run job
    job.waitForCompletion(true);

    // Successful?
    if (!job.isSuccessful()) {
        System.err.println("Error running job");
        return 1;
    }

    // Write results out
    System.out.println("Summary of graph");
    for (FileStatus file : fs.listStatus(outputPath)) {
        if (!file.isDirectory() && !file.getPath().getName().contains("_SUCCESS")) {
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
            Text text = new Text();
            SetOfStatistics stats = new SetOfStatistics();
            while (reader.next(text, stats)) {
                System.out.println(text + ", " + stats);
            }
            reader.close();
        }
    }

    return 0;
}

From source file:gov.llnl.ontology.mapreduce.stats.CompoundTokenCountMR.java

License:Open Source License

/**
 * Sets up the Reducer for this job.
 */
protected void setupReducer(String tableName, Job job, MRArgOptions options) {
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(options.getPositionalArg(0)));
    job.setNumReduceTasks(24);
}

From source file:gov.llnl.ontology.mapreduce.stats.DependencyOccurrenceCountMR.java

License:Open Source License

/**
 * Sets up the Reducer for this job.
 */
protected void setupReducer(String tableName, Job job, MRArgOptions options) {
    job.setCombinerClass(WordCountSumReducer.class);
    job.setReducerClass(WordCountSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(options.getPositionalArg(0)));
    job.setNumReduceTasks(24);
}

From source file:gov.llnl.ontology.mapreduce.stats.TagNetworkMR.java

License:Open Source License

/**
 * Sets up the Reducer for this job.
 */
protected void setupReducer(String tableName, Job job, MRArgOptions options) {
    job.setCombinerClass(WordSumReducer.class);
    job.setReducerClass(WordSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(options.getPositionalArg(0)));
    job.setNumReduceTasks(2);
}

From source file:gov.llnl.ontology.mapreduce.stats.WordnetShortestPathMR.java

License:Open Source License

/**
 * {@inheritDoc}
 */
public int run(String[] args) throws Exception {
    // Set up and validate the arguments.
    ArgOptions options = new ArgOptions();
    options.addOption('w', "wordnetDir", "The directory path to the wordnet data files", true, "PATH",
            "Required");

    options.parseOptions(args);
    if (!options.hasOption('w')) {
        System.err.println("usage: java WordnetShortestPathMR [OPTIONS] <outdir>\n" + options.prettyPrint());
    }

    // Open the wordnet reader and gather the set of all Synsets known by
    // the ontology.
    OntologyReader reader = WordNetCorpusReader.initialize(options.getStringOption('w'));
    Set<Synset> synsetSet = new HashSet<Synset>();
    for (String lemma : reader.wordnetTerms())
        for (Synset synset : reader.getSynsets(lemma))
            synsetSet.add(synset);

    // Compute each pairing of Synsets and write that pairing to a file in
    // HDFS.
    Synset[] synsets = synsetSet.toArray(new Synset[0]);
    PrintStream outStream = createPrintStream();
    for (int i = 0; i < synsets.length; ++i)
        for (int j = i + 1; j < synsets.length; ++j)
            outStream.printf("%s|%s\n", synsets[i].getName(), synsets[j].getName());
    outStream.close();

    // Store the wordnet directory information so that the mappers can load
    // it up.  They need it to figure out the shortest path information.
    Configuration conf = getConf();
    conf.set(WORDNET, options.getStringOption('w'));

    // Setup the job information.
    Job job = new Job(conf, "Compute Wordnet Shortest Paths");
    job.setJarByClass(WordnetShortestPathMR.class);

    job.setMapperClass(WordnetShortestPathMapper.class);

    // The input file will be the temporary file created with the synset
    // pairings.
    job.setInputFormatClass(LineDocInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(TEMP_TERM_PAIR_PATH));

    // The mappers do all of the real work, so we just write their output
    // straight to disk.
    job.setCombinerClass(Reducer.class);
    job.setReducerClass(Reducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(options.getPositionalArg(0)));

    // Start the job.
    job.waitForCompletion(true);

    return 0;
}

From source file:gr.ntua.ece.cslab.modissense.queries.clients.GeneralHotIntQueryClient.java

@Override
public void executeQuery() {
    try {
        if (this.createIfNotExist()) { //table exists            
            Configuration conf = HBaseConfiguration.create();
            Job job = new Job(conf, "Non personalized hotness interest");
            job.setJarByClass(GeneralHotIntQueryClient.class);
            Scan scan = new Scan();
            scan.setCaching(10000);

            scan.setFilter(new ColumnRangeFilter(Bytes.toBytes(startTimestamp), true,
                    Bytes.toBytes(endTimestamp), true));
            TableMapReduceUtil.initTableMapperJob(this.srcTable, // table name in bytes
                    scan, // scanner to use
                    GeneralHotIntQueryMapper.class, // mapper class
                    LongWritable.class, // key class
                    HotnessInterestWritable.class, // value class
                    job); // job object

            TableMapReduceUtil.initTableReducerJob(this.targetTable, GeneralHotIntQueryReducer.class, job);
            job.setPartitionerClass(HashPartitioner.class);
            job.setCombinerClass(GeneralHotIntQueryCombiner.class);
            job.setNumReduceTasks(4);
            job.setOutputFormatClass(TableOutputFormat.class);

            job.waitForCompletion(true);
        }
        this.openConnection(targetTable);
    } catch (IOException | InterruptedException | ClassNotFoundException ex) {
        Logger.getLogger(GeneralHotIntQueryClient.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:gr.ntua.h2rdf.byteImport.HexastoreBulkImport.java

License:Open Source License

public Job createSubmittableJob(String[] args) {
    TABLE_NAME = args[1];
    Job job = null;
    try {
        Configuration conf = new Configuration();
        conf.addResource("hbase-default.xml");
        conf.addResource("hbase-site.xml");
        job = new Job(conf, NAME);
        job.setJarByClass(HexastoreBulkImport.class);
        job.setMapperClass(TotalOrderPrep.Map.class);
        job.setReducerClass(Reduce.class);//sampler.HamaReducer.class);
        job.setCombinerClass(Combiner.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(ImmutableBytesWritable.class);
        job.setPartitionerClass(TotalOrderPartitioner.class);
        //TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("/user/npapa/"+regions+"partitions/part-r-00000"));
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("partitions/part-r-00000"));
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(HFileOutputFormat.class);
        Path out = new Path("out");
        FileOutputFormat.setOutputPath(job, out);
        FileSystem fs;
        try {
            fs = FileSystem.get(conf);
            if (fs.exists(out)) {
                fs.delete(out, true);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        // c.addResource(new Path("/0/arcomemDB/hadoop-0.20.2-cdh3u3/conf/hbase-site.xml"));
        HBaseAdmin hadmin = new HBaseAdmin(conf);
        HTableDescriptor desc = new HTableDescriptor(TABLE_NAME + "_stats");
        HColumnDescriptor family = new HColumnDescriptor("size");
        desc.addFamily(family);
        conf.setInt("zookeeper.session.timeout", 600000);
        if (hadmin.tableExists(TABLE_NAME + "_stats")) {
            //hadmin.disableTable(TABLE_NAME+"_stats");
            //hadmin.deleteTable(TABLE_NAME+"_stats");
        } else {
            hadmin.createTable(desc);
        }

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        //job.getConfiguration().setInt("mapred.map.tasks", 18);
        job.getConfiguration().set("h2rdf.tableName", TABLE_NAME);
        job.getConfiguration().setInt("mapred.reduce.tasks", (int) TotalOrderPrep.regions);
        job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
        job.getConfiguration().setInt("io.sort.mb", 100);
        job.getConfiguration().setInt("io.file.buffer.size", 131072);
        job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);
        //job.getConfiguration().setInt("hbase.hregion.max.filesize", 67108864);
        job.getConfiguration().setInt("hbase.hregion.max.filesize", 33554432);
        //job.getConfiguration().setInt("io.sort.mb", 100);

    } catch (IOException e2) {
        e2.printStackTrace();
    }

    return job;
}