List of usage examples for org.apache.hadoop.mapreduce.Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
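A combiner is simply a Reducer that the framework may apply to map output before the shuffle. Because it can be applied zero, one, or many times, its input and output key/value types must both match the map output types, and setCombinerClass must be called while the job is still being defined; after submission the call fails with IllegalStateException. Before the per-project examples below, here is a minimal word-count sketch of typical usage; the driver class name, TokenizerMapper, and the argument handling are illustrative, while IntSumReducer is Hadoop's org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class WordCountWithCombiner {

    // Emits (word, 1) for every token in the input line.
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, ONE);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count with combiner");
        job.setJarByClass(WordCountWithCombiner.class);
        job.setMapperClass(TokenizerMapper.class);

        // The combiner runs on map output before the shuffle; since it may be
        // applied any number of times, its input and output types must both
        // match the map output types (Text, IntWritable here).
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // All configuration, including setCombinerClass, must happen before
        // submission; afterwards the setter throws IllegalStateException.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}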
From source file:fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.PairedEndFastqToTfq.java
License:LGPL
/**
 * Create the job to convert FASTQ files into a TFQ file.
 * @param parentConf Hadoop configuration
 * @param fastqFile1 Path of the first FASTQ file
 * @param fastqFile2 Path of the second FASTQ file
 * @param outputFile Path of the output TFQ file
 * @param reducerTaskCount the reducer task count
 * @return a Hadoop Job
 * @throws IOException if an error occurs while creating the Job
 */
public static Job convert(final Configuration parentConf, final Path fastqFile1, final Path fastqFile2,
        final Path outputFile, final int reducerTaskCount) throws IOException {

    checkNotNull(parentConf, "parentConf argument cannot be null");
    checkNotNull(fastqFile1, "fastqFile1 argument cannot be null");
    checkNotNull(fastqFile2, "fastqFile2 argument cannot be null");
    checkNotNull(outputFile, "outputFile argument cannot be null");

    final Configuration jobConf = new Configuration(parentConf);

    // Create the job and set its name
    final Job job = Job.getInstance(jobConf, "Convert FASTQ paired files in TFQ (" + fastqFile1.getName()
            + ", " + fastqFile2.getName() + ", " + outputFile.getName() + ")");

    // Set the jar
    job.setJarByClass(PairedEndFastqToTfq.class);

    // Set input paths
    FileInputFormat.addInputPath(job, fastqFile1);
    FileInputFormat.addInputPath(job, fastqFile2);

    // Set the input format
    job.setInputFormatClass(FastqInputFormat.class);

    // Set the Reducer class
    job.setReducerClass(FastqPairedEndReducer.class);

    // Set the Combiner class
    job.setCombinerClass(FastqPairedEndReducer.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(Text.class);

    // Set the reducer task count
    if (reducerTaskCount > 0) {
        job.setNumReduceTasks(reducerTaskCount);
    }

    // Set output path
    FileOutputFormat.setOutputPath(job, outputFile);

    return job;
}
From source file:functionaltests.ext.mapreduce.TestMapReduce.java
License:Apache License
private Job prepareHadoopJob(boolean combiner) throws Throwable {
    helper.cleanup();

    // generate input
    helper.writeFile("in/part1", INPUT1);
    helper.writeFile("in/part2", INPUT2);

    // create and configure Hadoop job
    Configuration conf = new Configuration();
    Job job = new Job(conf, "word count");
    job.setMapperClass(TokenizerMapper.class);
    if (combiner) {
        job.setCombinerClass(IntSumReducer.class);
    }
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path("part1"));
    FileInputFormat.addInputPath(job, new Path("part2"));
    FileOutputFormat.setOutputPath(job, new Path("output"));
    return job;
}
From source file:gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java
License:Apache License
private void setupCombiner(final Job job) throws IOException {
    job.setCombinerClass(AccumuloKeyValueReducer.class);
}
From source file:gaffer.analytic.impl.GraphStatistics.java
License:Apache License
public int run(String[] args) throws Exception {
    // Usage
    if (args.length != 6 && args.length != 7) {
        System.err.println(USAGE);
        return 1;
    }

    // Parse options
    Path outputPath = new Path(args[0]);
    String accumuloPropertiesFile = args[1];
    int numReduceTasks;
    try {
        numReduceTasks = Integer.parseInt(args[2]);
    } catch (NumberFormatException e) {
        System.err.println(USAGE);
        return 1;
    }
    Date startDate = null;
    Date endDate = null;
    boolean useTimeWindow = false;
    if (!args[3].equals("null") && !args[4].equals("null")) {
        try {
            startDate = DATE_FORMAT.parse(args[3]);
            endDate = DATE_FORMAT.parse(args[4]);
        } catch (ParseException e) {
            System.err.println("Error parsing dates: " + args[3] + " " + args[4] + " " + e.getMessage());
            return 1;
        }
        useTimeWindow = true;
    }
    boolean rollUpOverTimeAndVisibility = Boolean.parseBoolean(args[5]);
    boolean seedsSpecified = (args.length == 7);
    String seedsFile = "";
    if (seedsSpecified) {
        seedsFile = args[6];
    }

    // Hadoop configuration
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Connect to Accumulo, so we can check the connection and check that the table exists
    AccumuloConfig accConf = new AccumuloConfig(accumuloPropertiesFile);
    Connector conn = Accumulo.connect(accConf);
    String tableName = accConf.getTable();
    Authorizations authorizations = conn.securityOperations().getUserAuthorizations(accConf.getUserName());

    // Check if the table exists
    if (!conn.tableOperations().exists(tableName)) {
        System.err.println("Table " + tableName + " does not exist.");
        return 1;
    }

    // Create graph and update configuration based on the view
    AccumuloBackedGraph graph = new AccumuloBackedGraph(conn, tableName);
    if (useTimeWindow) {
        graph.setTimeWindow(startDate, endDate);
    }
    graph.rollUpOverTimeAndVisibility(rollUpOverTimeAndVisibility);
    if (seedsSpecified) {
        Set<TypeValue> typeValues = new HashSet<TypeValue>();
        BufferedReader reader = new BufferedReader(new FileReader(seedsFile));
        String line;
        while ((line = reader.readLine()) != null) {
            String[] tokens = line.split("\\|");
            if (tokens.length != 2) {
                System.err.println("Invalid line: " + line);
                continue;
            }
            String type = tokens[0];
            String value = tokens[1];
            typeValues.add(new TypeValue(type, value));
        }
        reader.close();
        graph.setConfiguration(conf, typeValues, accConf);
    } else {
        graph.setConfiguration(conf, accConf);
    }

    // Conf
    conf.setBoolean("mapred.compress.map.output", true);
    conf.setClass("mapred.map.output.compression.codec", SnappyCodec.class, CompressionCodec.class);

    // Job
    Job job = new Job(conf);
    job.setJarByClass(getClass());
    job.setJobName("Running MapReduce against Gaffer data in Accumulo: input = " + tableName
            + ", output = " + outputPath);

    // Input format - use BatchScannerElementInputFormat if seeds have been specified (as that creates fewer
    // splits); otherwise use ElementInputFormat which is based on the standard AccumuloInputFormat.
    if (seedsSpecified) {
        job.setInputFormatClass(BatchScannerElementInputFormat.class);
    } else {
        job.setInputFormatClass(ElementInputFormat.class);
    }

    // Mapper
    job.setMapperClass(GraphStatisticsMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(SetOfStatistics.class);

    // Combiner
    job.setCombinerClass(GraphStatisticsReducer.class);

    // Reducer
    job.setReducerClass(GraphStatisticsReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SetOfStatistics.class);
    job.setNumReduceTasks(numReduceTasks);

    // Output
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);

    System.out.println("Running MapReduce job over:");
    System.out.println("\tTable: " + accConf.getTable());
    System.out.println("\tUser: " + accConf.getUserName());
    System.out.println("\tAuths: " + authorizations);
    if (useTimeWindow) {
        System.out.println("\tFilter by time: start time is " + DATE_FORMAT.format(startDate) + ", "
                + DATE_FORMAT.format(endDate));
    } else {
        System.out.println("\tFilter by time is off");
    }
    System.out.println("\tRoll up over time and visibility: " + rollUpOverTimeAndVisibility);

    // Run job
    job.waitForCompletion(true);

    // Successful?
    if (!job.isSuccessful()) {
        System.err.println("Error running job");
        return 1;
    }

    // Write results out
    System.out.println("Summary of graph");
    for (FileStatus file : fs.listStatus(outputPath)) {
        if (!file.isDirectory() && !file.getPath().getName().contains("_SUCCESS")) {
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, file.getPath(), conf);
            Text text = new Text();
            SetOfStatistics stats = new SetOfStatistics();
            while (reader.next(text, stats)) {
                System.out.println(text + ", " + stats);
            }
            reader.close();
        }
    }

    return 0;
}
From source file:gov.llnl.ontology.mapreduce.stats.CompoundTokenCountMR.java
License:Open Source License
/**
 * Sets up the Reducer for this job.
 */
protected void setupReducer(String tableName, Job job, MRArgOptions options) {
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(options.getPositionalArg(0)));
    job.setNumReduceTasks(24);
}
From source file:gov.llnl.ontology.mapreduce.stats.DependencyOccurrenceCountMR.java
License:Open Source License
/**
 * Sets up the Reducer for this job.
 */
protected void setupReducer(String tableName, Job job, MRArgOptions options) {
    job.setCombinerClass(WordCountSumReducer.class);
    job.setReducerClass(WordCountSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(options.getPositionalArg(0)));
    job.setNumReduceTasks(24);
}
From source file:gov.llnl.ontology.mapreduce.stats.TagNetworkMR.java
License:Open Source License
/**
 * Sets up the Reducer for this job.
 */
protected void setupReducer(String tableName, Job job, MRArgOptions options) {
    job.setCombinerClass(WordSumReducer.class);
    job.setReducerClass(WordSumReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(options.getPositionalArg(0)));
    job.setNumReduceTasks(2);
}
From source file:gov.llnl.ontology.mapreduce.stats.WordnetShortestPathMR.java
License:Open Source License
/**
 * {@inheritDoc}
 */
public int run(String[] args) throws Exception {
    // Setup and validate the arguments.
    ArgOptions options = new ArgOptions();
    options.addOption('w', "wordnetDir", "The directory path to the wordnet data files",
            true, "PATH", "Required");
    options.parseOptions(args);
    if (!options.hasOption('w')) {
        System.err.println("usage: java WordnetShortestPathMR [OPTIONS] <outdir>\n" + options.prettyPrint());
    }

    // Open the wordnet reader and gather the set of all Synsets known by
    // the ontology.
    OntologyReader reader = WordNetCorpusReader.initialize(options.getStringOption('w'));
    Set<Synset> synsetSet = new HashSet<Synset>();
    for (String lemma : reader.wordnetTerms())
        for (Synset synset : reader.getSynsets(lemma))
            synsetSet.add(synset);

    // Compute each pairing of Synsets and write that pairing to a file in HDFS.
    Synset[] synsets = synsetSet.toArray(new Synset[0]);
    PrintStream outStream = createPrintStream();
    for (int i = 0; i < synsets.length; ++i)
        for (int j = i + 1; j < synsets.length; ++j)
            outStream.printf("%s|%s\n", synsets[i].getName(), synsets[j].getName());
    outStream.close();

    // Store the wordnet directory information so that the mappers can load
    // it up.  They need it to figure out the shortest path information.
    Configuration conf = getConf();
    conf.set(WORDNET, options.getStringOption('w'));

    // Setup the job information.
    Job job = new Job(conf, "Compute Wordnet Shortest Paths");
    job.setJarByClass(WordnetShortestPathMR.class);
    job.setMapperClass(WordnetShortestPathMapper.class);

    // The input file will be the temporary file created with the synset pairings.
    job.setInputFormatClass(LineDocInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(TEMP_TERM_PAIR_PATH));

    // The mappers do all of the real work, so we just write their output straight to disk.
    job.setCombinerClass(Reducer.class);
    job.setReducerClass(Reducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(options.getPositionalArg(0)));

    // Start the job.
    job.waitForCompletion(true);

    return 0;
}
From source file:gr.ntua.ece.cslab.modissense.queries.clients.GeneralHotIntQueryClient.java
@Override
public void executeQuery() {
    try {
        if (this.createIfNotExist()) { // table exists
            Configuration conf = HBaseConfiguration.create();
            Job job = new Job(conf, "Non personalized hotness interest");
            job.setJarByClass(GeneralHotIntQueryClient.class);

            Scan scan = new Scan();
            scan.setCaching(10000);
            scan.setFilter(new ColumnRangeFilter(Bytes.toBytes(startTimestamp), true,
                    Bytes.toBytes(endTimestamp), true));

            TableMapReduceUtil.initTableMapperJob(this.srcTable, // table name in bytes
                    scan, // scanner to use
                    GeneralHotIntQueryMapper.class, // mapper class
                    LongWritable.class, // key class
                    HotnessInterestWritable.class, // value class
                    job); // job object
            TableMapReduceUtil.initTableReducerJob(this.targetTable, GeneralHotIntQueryReducer.class, job);

            job.setPartitionerClass(HashPartitioner.class);
            job.setCombinerClass(GeneralHotIntQueryCombiner.class);
            job.setNumReduceTasks(4);
            job.setOutputFormatClass(TableOutputFormat.class);

            job.waitForCompletion(true);
        }
        this.openConnection(targetTable);
    } catch (IOException | InterruptedException | ClassNotFoundException ex) {
        Logger.getLogger(GeneralHotIntQueryClient.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:gr.ntua.h2rdf.byteImport.HexastoreBulkImport.java
License:Open Source License
public Job createSubmittableJob(String[] args) {
    TABLE_NAME = args[1];
    Job job = null;
    try {
        Configuration conf = new Configuration();
        conf.addResource("hbase-default.xml");
        conf.addResource("hbase-site.xml");
        job = new Job(conf, NAME);
        job.setJarByClass(HexastoreBulkImport.class);
        job.setMapperClass(TotalOrderPrep.Map.class);
        job.setReducerClass(Reduce.class); //sampler.HamaReducer.class);
        job.setCombinerClass(Combiner.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(ImmutableBytesWritable.class);
        job.setPartitionerClass(TotalOrderPartitioner.class);
        //TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("/user/npapa/"+regions+"partitions/part-r-00000"));
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("partitions/part-r-00000"));
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(HFileOutputFormat.class);
        Path out = new Path("out");
        FileOutputFormat.setOutputPath(job, out);
        FileSystem fs;
        try {
            fs = FileSystem.get(conf);
            if (fs.exists(out)) {
                fs.delete(out, true);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        // c.addResource(new Path("/0/arcomemDB/hadoop-0.20.2-cdh3u3/conf/hbase-site.xml"));
        HBaseAdmin hadmin = new HBaseAdmin(conf);
        HTableDescriptor desc = new HTableDescriptor(TABLE_NAME + "_stats");
        HColumnDescriptor family = new HColumnDescriptor("size");
        desc.addFamily(family);
        conf.setInt("zookeeper.session.timeout", 600000);
        if (hadmin.tableExists(TABLE_NAME + "_stats")) {
            //hadmin.disableTable(TABLE_NAME+"_stats");
            //hadmin.deleteTable(TABLE_NAME+"_stats");
        } else {
            hadmin.createTable(desc);
        }

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        //job.getConfiguration().setInt("mapred.map.tasks", 18);
        job.getConfiguration().set("h2rdf.tableName", TABLE_NAME);
        job.getConfiguration().setInt("mapred.reduce.tasks", (int) TotalOrderPrep.regions);
        job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
        job.getConfiguration().setInt("io.sort.mb", 100);
        job.getConfiguration().setInt("io.file.buffer.size", 131072);
        job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);
        //job.getConfiguration().setInt("hbase.hregion.max.filesize", 67108864);
        job.getConfiguration().setInt("hbase.hregion.max.filesize", 33554432);
    } catch (IOException e2) {
        e2.printStackTrace();
    }
    return job;
}