List of usage examples for org.apache.hadoop.conf.Configuration.setBoolean

public void setBoolean(String name, boolean value)

Sets the value of the name property to a boolean.
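For quick reference, here is a minimal round-trip sketch (the property name below is made up for illustration): setBoolean stores the value as the string "true" or "false", and getBoolean reads it back, returning the supplied default when the key is absent.

import org.apache.hadoop.conf.Configuration;

public class SetBooleanExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Stored internally as the string "true"
        conf.setBoolean("example.feature.enabled", true);
        // Reads the value back; the second argument is the default used when the key is absent
        boolean enabled = conf.getBoolean("example.feature.enabled", false);
        System.out.println(enabled); // prints: true
    }
}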
From source file:edu.umn.cs.spatialHadoop.operations.Indexer.java
License:Open Source License
/**
 * Create a partitioner for a particular job
 * @param ins
 * @param out
 * @param job
 * @param partitionerName
 * @return
 * @throws IOException
 */
public static Partitioner createPartitioner(Path[] ins, Path out, Configuration job, String partitionerName)
        throws IOException {
    try {
        Partitioner partitioner = null;
        Class<? extends Partitioner> partitionerClass = PartitionerClasses.get(partitionerName.toLowerCase());
        if (partitionerClass == null) {
            // Try to parse the name as a class name
            try {
                partitionerClass = Class.forName(partitionerName).asSubclass(Partitioner.class);
            } catch (ClassNotFoundException e) {
                throw new RuntimeException("Unknown index type '" + partitionerName + "'");
            }
        }
        if (PartitionerReplicate.containsKey(partitionerName.toLowerCase())) {
            boolean replicate = PartitionerReplicate.get(partitionerName.toLowerCase());
            job.setBoolean("replicate", replicate);
        }
        partitioner = partitionerClass.newInstance();

        long t1 = System.currentTimeMillis();
        final Rectangle inMBR = (Rectangle) OperationsParams.getShape(job, "mbr");

        // Determine number of partitions
        long inSize = 0;
        for (Path in : ins) {
            inSize += FileUtil.getPathSize(in.getFileSystem(job), in);
        }
        long estimatedOutSize = (long) (inSize * (1.0 + job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.1f)));
        FileSystem outFS = out.getFileSystem(job);
        long outBlockSize = outFS.getDefaultBlockSize(out);
        int numPartitions = Math.max(1, (int) Math.ceil((float) estimatedOutSize / outBlockSize));
        LOG.info("Partitioning the space into " + numPartitions + " partitions");

        final Vector<Point> sample = new Vector<Point>();
        float sample_ratio = job.getFloat(SpatialSite.SAMPLE_RATIO, 0.01f);
        long sample_size = job.getLong(SpatialSite.SAMPLE_SIZE, 100 * 1024 * 1024);

        LOG.info("Reading a sample of " + (int) Math.round(sample_ratio * 100) + "%");
        ResultCollector<Point> resultCollector = new ResultCollector<Point>() {
            @Override
            public void collect(Point p) {
                sample.add(p.clone());
            }
        };
        OperationsParams params2 = new OperationsParams(job);
        params2.setFloat("ratio", sample_ratio);
        params2.setLong("size", sample_size);
        params2.setClass("outshape", Point.class, Shape.class);
        Sampler.sample(ins, resultCollector, params2);
        long t2 = System.currentTimeMillis();
        System.out.println("Total time for sampling in millis: " + (t2 - t1));

        LOG.info("Finished reading a sample of " + sample.size() + " records");

        partitioner.createFromPoints(inMBR, sample.toArray(new Point[sample.size()]), numPartitions);

        return partitioner;
    } catch (InstantiationException e) {
        e.printStackTrace();
        return null;
    } catch (IllegalAccessException e) {
        e.printStackTrace();
        return null;
    }
}
From source file:edu.usc.pgroup.louvain.hadoop.LouvainMR.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    int displayLevel = Integer.parseInt(args[2]);
    boolean v = false;
    if (args.length > 3) {
        v = Boolean.parseBoolean(args[3]);
    }

    conf.setInt(DISPLAY_LEVEL, displayLevel);
    conf.setBoolean(VERBOSE, v);
    conf.set(OUT_PATH, args[1]);

    Job job = new Job(conf);
    job.setJobName(TestJob.class.getName());
    job.setJarByClass(TestJob.class);

    job.setMapperClass(MapCommunity.class);
    job.setReducerClass(ReduceCommunity.class);

    // Hello there ZipFileInputFormat!
    job.setInputFormatClass(GraphInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    TextOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
}
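The flag stored with conf.setBoolean(VERBOSE, v) above is typically read back on the task side through the job's Configuration. A minimal sketch of that pattern follows; the key name "verbose", the mapper's name, and its type parameters are assumptions chosen for illustration and are not taken from the LouvainMR source.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper showing how a driver-set boolean is retrieved inside a task
public class VerboseAwareMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {

    private boolean verbose;

    @Override
    protected void setup(Context context) {
        Configuration conf = context.getConfiguration();
        // Assumed key name; getBoolean returns false here if the driver never set it
        verbose = conf.getBoolean("verbose", false);
    }

    @Override
    protected void map(Text key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        if (verbose) {
            System.out.println("Processing key: " + key);
        }
        context.write(key, value);
    }
}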
From source file:etl.cmd.test.XFsTestCase.java
License:Apache License
/**
 * Set up the testcase.
 *
 * @throws Exception thrown if the test case could not be set up.
 */
protected void setUp() throws Exception {
    super.setUp();
    Configuration conf = new XConfiguration();
    conf.setBoolean("oozie.service.HadoopAccessorService.kerberos.enabled",
            System.getProperty("oozie.test.hadoop.security", "simple").equals("kerberos"));
    conf.set("oozie.service.HadoopAccessorService.keytab.file", getKeytabFile());
    conf.set("oozie.service.HadoopAccessorService.kerberos.principal", getOoziePrincipal());
    conf.set("local.realm", getRealm());
    conf.set("oozie.service.HadoopAccessorService.hadoop.configurations", "*=hadoop-conf");
    conf.set("oozie.service.HadoopAccessorService.action.configurations", "*=action-conf");

    has = new HadoopAccessorService();
    has.init(conf);

    JobConf jobConf = has.createJobConf(getNameNodeUri());
    XConfiguration.copy(conf, jobConf);
    fileSystem = has.createFileSystem(getTestUser(), new URI(getNameNodeUri()), jobConf);
    fsTestDir = initFileSystem(fileSystem);
    if (System.getProperty("oozie.test.hadoop.minicluster2", "false").equals("true")) {
        fileSystem2 = has.createFileSystem(getTestUser(), new URI(getNameNode2Uri()), jobConf);
        fsTestDir2 = initFileSystem(fileSystem2);
    }
}
From source file:eu.scape_project.tpid.TomarPrepareInputdata.java
License:Apache License
/**
 * Start.
 *
 * @param args Command line arguments
 * @throws IOException
 * @throws ParseException
 */
private static void start(String[] args) throws IOException, ParseException {
    // Hadoop configuration
    Configuration hadoopConf = new Configuration();
    // Command line interface
    config = new TpidCliConfig();
    CommandLineParser cmdParser = new PosixParser();
    GenericOptionsParser gop = new GenericOptionsParser(hadoopConf, args);
    TpidOptions tpidOptions = new TpidOptions();
    CommandLine cmd = cmdParser.parse(tpidOptions.options, gop.getRemainingArgs());
    if ((args.length == 0) || (cmd.hasOption(tpidOptions.HELP_OPT))) {
        tpidOptions.exit("Help", 0);
    } else {
        tpidOptions.initOptions(cmd, config);
    }

    // Configuration properties
    if (config.getPropertiesFilePath() != null) {
        pu = new PropertyUtil(config.getPropertiesFilePath(), true);
    } else {
        pu = new PropertyUtil("/eu/scape_project/tpid/config.properties", false);
    }

    // CLI parameter has priority over default configuration
    int cliParamNumPerInv = config.getNumItemsPerInvokation();
    int defaultNumPerInv = Integer.parseInt(pu.getProp("default.itemsperinvokation"));
    int numPerInv = (cliParamNumPerInv != 0) ? cliParamNumPerInv : defaultNumPerInv;

    // Setting Hadoop configuration parameters so that they can be used during MapReduce
    hadoopConf.setInt("num_items_per_task", numPerInv);
    hadoopConf.set("output_file_suffix", pu.getProp("default.outputfilesuffix"));
    hadoopConf.set("scape_platform_invoke", pu.getProp("tomar.invoke.command"));
    hadoopConf.set("unpack_hdfs_path", pu.getProp("default.hdfsdir.unpacked"));
    hadoopConf.set("joboutput_hdfs_path", pu.getProp("default.hdfsdir.joboutput"));
    hadoopConf.set("tooloutput_hdfs_path", pu.getProp("default.hdfsdir.toolout"));
    hadoopConf.set("container_file_suffix", pu.getProp("containerfilesuffix"));
    hadoopConf.set("tomar_param_pattern", pu.getProp("tomar.param.pattern"));
    hadoopConf.setBoolean("pseudo_distributed", config.isPseudoDistributed());

    startHadoopJob(hadoopConf);
}
From source file:fi.tkk.ics.hadoop.bam.cli.CLIMRBAMPlugin.java
License:Open Source License
/**
 * Should be called before accessing any of the protected data such as samFormat.
 */
@Override
public boolean cacheAndSetProperties(CmdLineParser parser) {
    if (!super.cacheAndSetProperties(parser))
        return false;

    if (!cacheSAMFormat(parser))
        return false;

    final Configuration conf = getConf();

    conf.setBoolean(AnySAMInputFormat.TRUST_EXTS_PROPERTY, !parser.getBoolean(noTrustExtsOpt));

    // Let the output format know if we're going to merge the output, so that
    // it doesn't write headers into the intermediate files.
    conf.setBoolean(KeyIgnoringAnySAMOutputFormat.WRITE_HEADER_PROPERTY, outPath == null);

    return true;
}
From source file:fi.tkk.ics.hadoop.bam.cli.plugins.chipster.Summarize.java
License:Open Source License
@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    switch (args.size()) {
    case 0:
        return missingArg("WORKDIR");
    case 1:
        return missingArg("LEVELS");
    case 2:
        return missingArg("INPATH");
    default:
        break;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    levels = args.get(1).split(",");
    for (String l : levels) {
        try {
            int lvl = Integer.parseInt(l);
            if (lvl > 0)
                continue;
            System.err.printf("summarize :: summary level '%d' is not positive!\n", lvl);
        } catch (NumberFormatException e) {
            System.err.printf("summarize :: summary level '%s' is not an integer!\n", l);
        }
        return 3;
    }

    wrkDir = new Path(args.get(0));
    final Path bam = new Path(args.get(2));

    final boolean sort = parser.getBoolean(sortOpt);

    final Configuration conf = getConf();

    conf.setBoolean(AnySAMInputFormat.TRUST_EXTS_PROPERTY, !parser.getBoolean(noTrustExtsOpt));

    // Used by Utils.getMergeableWorkFile() to name the output files.
    wrkFile = bam.getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, wrkFile);

    conf.setStrings(SummarizeReducer.SUMMARY_LEVELS_PROP, levels);

    try {
        try {
            // There's a lot of different Paths here, and it can get a bit
            // confusing. Here's how it works:
            //
            // - outPath is the output dir for the final merged output, given
            //   with the -o parameter.
            //
            // - wrkDir is the user-given path where the outputs of the
            //   reducers go.
            //
            // - mergedTmpDir (defined further below) is $wrkDir/sort.tmp: if
            //   we are sorting, the summaries output in the first Hadoop job
            //   are merged in there.
            //
            // - mainSortOutputDir is $wrkDir/sorted.tmp: getSortOutputDir()
            //   gives a per-level/strand directory under it, which is used by
            //   doSorting() and mergeOne(). This is necessary because we
            //   cannot have multiple Hadoop jobs outputting into the same
            //   directory at the same time, as explained in the comment in
            //   sortMerged().

            // Required for path ".", for example.
            wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

            mainSortOutputDir = sort ? new Path(wrkDir, "sorted.tmp") : null;

            if (!runSummary(bam))
                return 4;
        } catch (IOException e) {
            System.err.printf("summarize :: Summarizing failed: %s\n", e);
            return 4;
        }

        Path mergedTmpDir = null;
        try {
            if (sort) {
                mergedTmpDir = new Path(wrkDir, "sort.tmp");
                mergeOutputs(mergedTmpDir);
            } else if (outPath != null)
                mergeOutputs(outPath);
        } catch (IOException e) {
            System.err.printf("summarize :: Merging failed: %s\n", e);
            return 5;
        }

        if (sort) {
            if (!doSorting(mergedTmpDir))
                return 6;

            // Reset this since SummarySort uses it.
            conf.set(Utils.WORK_FILENAME_PROPERTY, wrkFile);

            tryDelete(mergedTmpDir);

            if (outPath != null)
                try {
                    sorted = true;
                    mergeOutputs(outPath);
                } catch (IOException e) {
                    System.err.printf("summarize :: Merging sorted output failed: %s\n", e);
                    return 7;
                }
            else {
                // Move the unmerged results out of the mainSortOutputDir
                // subdirectories to wrkDir.
                System.out.println("summarize :: Moving outputs from temporary directories...");
                t.start();

                try {
                    final FileSystem fs = wrkDir.getFileSystem(conf);
                    for (String lvl : levels) {
                        final FileStatus[] parts;

                        try {
                            parts = fs.globStatus(new Path(new Path(mainSortOutputDir, lvl + "[fr]"),
                                    "*-[0-9][0-9][0-9][0-9][0-9][0-9]"));
                        } catch (IOException e) {
                            System.err.printf("summarize :: Couldn't move level %s results: %s", lvl, e);
                            continue;
                        }

                        for (FileStatus part : parts) {
                            final Path path = part.getPath();
                            try {
                                fs.rename(path, new Path(wrkDir, path.getName()));
                            } catch (IOException e) {
                                System.err.printf("summarize :: Couldn't move '%s': %s", path, e);
                            }
                        }
                    }
                } catch (IOException e) {
                    System.err.printf("summarize :: Moving results failed: %s", e);
                }

                System.out.printf("summarize :: Moved in %d.%03d s.\n", t.stopS(), t.fms());
            }

            tryDelete(mainSortOutputDir);
        }
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    return 0;
}
From source file:fi.tkk.ics.hadoop.bam.cli.plugins.VCFSort.java
License:Open Source License
@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("vcf-sort :: WORKDIR not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("vcf-sort :: INPATH not given.");
        return 3;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    Path wrkDir = new Path(args.get(0));
    final Path inPath = new Path(args.get(1));

    final Configuration conf = getConf();

    VCFFormat vcfFormat = null;

    final String f = (String) parser.getOptionValue(formatOpt);
    if (f != null) {
        try {
            vcfFormat = VCFFormat.valueOf(f.toUpperCase(Locale.ENGLISH));
        } catch (IllegalArgumentException e) {
            System.err.printf("%s :: invalid format '%s'\n", getCommandName(), f);
            return 3;
        }
    }
    if (vcfFormat == null)
        vcfFormat = outPath == null ? VCFFormat.BCF : VCFFormat.inferFromFilePath(outPath);

    conf.setBoolean(VCFInputFormat.TRUST_EXTS_PROPERTY, !parser.getBoolean(noTrustExtsOpt));

    conf.setBoolean(KeyIgnoringVCFOutputFormat.WRITE_HEADER_PROPERTY, outPath == null);

    conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, vcfFormat.toString());

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inPath : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    conf.set(SortOutputFormat.INPUT_PATH_PROP, inPath.toString());

    final Timer t = new Timer();
    try {
        // Required for path ".", for example.
        wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

        Utils.configureSampling(wrkDir, intermediateOutName, conf);

        final Job job = new Job(conf);

        job.setJarByClass(VCFSort.class);
        job.setMapperClass(Mapper.class);
        job.setReducerClass(VCFSortReducer.class);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(VariantContextWritable.class);

        job.setInputFormatClass(VCFInputFormat.class);
        job.setOutputFormatClass(SortOutputFormat.class);

        FileInputFormat.addInputPath(job, inPath);
        FileOutputFormat.setOutputPath(job, wrkDir);

        job.setPartitionerClass(TotalOrderPartitioner.class);

        System.out.println("vcf-sort :: Sampling...");
        t.start();

        InputSampler.<LongWritable, VariantContextWritable>writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, VariantContextWritable>(0.01, 10000,
                        Math.max(100, reduceTasks)));

        System.out.printf("vcf-sort :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());

        job.submit();

        System.out.println("vcf-sort :: Waiting for job completion...");
        t.start();

        if (!job.waitForCompletion(verbose)) {
            System.err.println("vcf-sort :: Job failed.");
            return 4;
        }

        System.out.printf("vcf-sort :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());
    } catch (IOException e) {
        System.err.printf("vcf-sort :: Hadoop error: %s\n", e);
        return 4;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    if (outPath != null)
        try {
            System.out.println("vcf-sort :: Merging output...");
            t.start();

            final OutputStream outs = outPath.getFileSystem(conf).create(outPath);

            // First, place the VCF or BCF header.
            final WrapSeekable ins = WrapSeekable.openPath(conf, inPath);
            final VCFHeader header = VCFHeaderReader.readHeaderFrom(ins);
            ins.close();

            final VariantContextWriter writer;

            switch (vcfFormat) {
            case VCF:
                writer = VariantContextWriterFactory.create(new FilterOutputStream(outs) {
                    @Override
                    public void close() throws IOException {
                        this.out.flush();
                    }
                }, null, VariantContextWriterFactory.NO_OPTIONS);
                break;

            case BCF:
                writer = VariantContextWriterFactory
                        .create(new FilterOutputStream(new BlockCompressedOutputStream(outs, null)) {
                            @Override
                            public void close() throws IOException {
                                this.out.flush();
                            }
                        }, null, EnumSet.of(Options.FORCE_BCF));
                break;

            default:
                assert false;
                writer = null;
                break;
            }

            writer.writeHeader(header);
            writer.close();

            // Then, the actual VCF or BCF contents.
            Utils.mergeInto(outs, wrkDir, "", "", conf, "vcf-sort");

            // And if BCF, the BGZF terminator.
            if (vcfFormat == VCFFormat.BCF)
                outs.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);

            outs.close();

            System.out.printf("vcf-sort :: Merging complete in %d.%03d s.\n", t.stopS(), t.fms());
        } catch (IOException e) {
            System.err.printf("vcf-sort :: Output merging failed: %s\n", e);
            return 5;
        }

    return 0;
}
From source file:fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop.ExpressionHadoopModule.java
License:LGPL
/**
 * Create JobConf object for HTSeq-count.
 * @param context the task context
 * @param alignmentsData alignment data
 * @param featureAnnotationData feature annotations data
 * @param gtfFormat true if the annotation file is in GTF format
 * @param genomeDescriptionData genome description data
 * @param genomicType genomic type
 * @param attributeId attributeId
 * @param splitAttributeValues split attribute values
 * @param stranded stranded mode
 * @param overlapMode overlap mode
 * @param removeAmbiguousCases true to remove ambiguous cases
 * @throws IOException if an error occurs while creating job
 * @throws BadBioEntryException if an entry of the annotation file is invalid
 * @throws EoulsanException if the job creating fails
 */
private static Job createJobHTSeqCounter(final Configuration parentConf, final TaskContext context,
        final Data alignmentsData, final Data featureAnnotationData, final boolean gtfFormat,
        final Data genomeDescriptionData, final Data outData, final String genomicType,
        final String attributeId, final boolean splitAttributeValues, final StrandUsage stranded,
        final OverlapMode overlapMode, final boolean removeAmbiguousCases, final boolean tsamFormat)
        throws IOException, BadBioEntryException, EoulsanException {

    final Configuration jobConf = new Configuration(parentConf);

    // Get input DataFile
    DataFile inputDataFile = alignmentsData.getDataFile();

    if (inputDataFile == null) {
        throw new IOException("No input file found.");
    }

    final String dataFileSource;

    if (tsamFormat) {
        dataFileSource = StringUtils.filenameWithoutExtension(inputDataFile.getSource()) + TSAM_EXTENSION;
    } else {
        dataFileSource = inputDataFile.getSource();
    }

    // Set input path
    final Path inputPath = new Path(dataFileSource);

    // Get annotation DataFile
    final DataFile annotationDataFile = featureAnnotationData.getDataFile();

    // Get output file
    final DataFile outFile = outData.getDataFile();

    // Get temporary file
    final DataFile tmpFile = new DataFile(outFile.getParent(), outFile.getBasename() + ".tmp");

    getLogger().fine("sample: " + alignmentsData.getName());
    getLogger().fine("inputPath.getName(): " + inputPath.getName());
    getLogger().fine("annotationDataFile: " + annotationDataFile.getSource());
    getLogger().fine("outFile: " + outFile.getSource());
    getLogger().fine("tmpFile: " + tmpFile.getSource());

    jobConf.set("mapred.child.java.opts", "-Xmx1024m");

    // Set counter group
    jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);

    // Set Genome description path
    final DataFile genomeDescDataFile = genomeDescriptionData.getDataFile();
    jobConf.set(GENOME_DESC_PATH_KEY, genomeDescDataFile.getSource());

    // Set the "stranded" parameter
    jobConf.set(HTSeqCountMapper.STRANDED_PARAM, stranded.getName());

    // Set the "overlap mode" parameter
    jobConf.set(HTSeqCountMapper.OVERLAP_MODE_PARAM, overlapMode.getName());

    // Set the "remove ambiguous cases" parameter
    jobConf.setBoolean(HTSeqCountMapper.REMOVE_AMBIGUOUS_CASES, removeAmbiguousCases);

    final Path featuresIndexPath = getAnnotationIndexSerializedPath(featureAnnotationData.getDataFile());

    getLogger().info("featuresIndexPath: " + featuresIndexPath);

    // Create serialized feature index
    if (!PathUtils.isFile(featuresIndexPath, jobConf)) {

        final Locker lock = createZookeeperLock(parentConf, context);

        lock.lock();

        createFeaturesIndex(context, annotationDataFile, gtfFormat, genomicType, attributeId,
                splitAttributeValues, stranded, genomeDescDataFile, featuresIndexPath, jobConf);

        lock.unlock();
    }

    // Create the job and its name
    final Job job = Job.getInstance(jobConf,
            "Expression computation with htseq-count (" + alignmentsData.getName() + ", " + inputPath.getName()
                    + ", " + annotationDataFile.getSource() + ", " + genomicType + ", " + attributeId
                    + ", stranded: " + stranded + ", removeAmbiguousCases: " + removeAmbiguousCases + ")");

    // Set the path to the features index
    job.addCacheFile(featuresIndexPath.toUri());

    // Set the jar
    job.setJarByClass(ExpressionHadoopModule.class);

    // Set input path
    FileInputFormat.setInputPaths(job, inputPath);

    // Set input format
    job.setInputFormatClass(SAMInputFormat.class);

    // Set the mapper class
    job.setMapperClass(HTSeqCountMapper.class);

    // Set the combiner class
    job.setCombinerClass(HTSeqCountReducer.class);

    // Set the reducer class
    job.setReducerClass(HTSeqCountReducer.class);

    // Set the output format
    job.setOutputFormatClass(ExpressionOutputFormat.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(LongWritable.class);

    // Set output path
    FileOutputFormat.setOutputPath(job, new Path(tmpFile.getSource()));

    return job;
}
From source file:gaffer.accumulo.bulkimport.BulkImportDriver.java
License:Apache License
public int run(String[] args) throws Exception {
    // Usage
    if (args.length < 3) {
        System.err.println("Usage: " + BulkImportDriver.class.getName()
                + " <inputpath> <output_path> <accumulo_properties_file>");
        return 1;
    }

    // Gets paths
    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1] + "/data_for_accumulo/");
    Path splitsFilePath = new Path(args[1] + "/splits_file");
    String accumuloPropertiesFile = args[2];

    // Hadoop configuration
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Connect to Accumulo
    AccumuloConfig accConf = new AccumuloConfig(accumuloPropertiesFile);
    Connector conn = Accumulo.connect(accConf);
    String tableName = accConf.getTable();

    // Check if the table exists
    if (!conn.tableOperations().exists(tableName)) {
        System.err.println("Table " + tableName + " does not exist - create the table before running this");
        return 1;
    }

    // Get the current splits from the table.
    // (This assumes that we have already created the table using <code>InitialiseTable</code>.)
    Collection<Text> splits = conn.tableOperations().getSplits(tableName);
    int numSplits = splits.size();
    System.out.println("Number of splits in table is " + numSplits);

    // Write current splits to a file (this is needed so that the following MapReduce
    // job can move them to the DistributedCache).
    IngestUtils.createSplitsFile(conn, tableName, fs, splitsFilePath);

    // Run MapReduce to output data suitable for bulk import to Accumulo

    // Conf and job
    conf.setBoolean("mapred.compress.map.output", true);
    conf.setClass("mapred.map.output.compression.codec", SnappyCodec.class, CompressionCodec.class);
    Job job = new Job(conf);
    job.setJarByClass(getClass());
    job.setJobName("Convert data to Accumulo format: input = " + inputPath + ", output = " + outputPath);

    // Input
    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, inputPath);

    // Mapper
    job.setMapperClass(BulkImportMapper.class);
    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);

    // Partitioner
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath.toString());

    // Reducer
    job.setReducerClass(BulkImportReducer.class);
    job.setOutputKeyClass(Key.class);
    job.setOutputValueClass(Value.class);
    job.setNumReduceTasks(numSplits + 1);

    // Output
    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    AccumuloFileOutputFormat.setOutputPath(job, outputPath);

    // Run job
    job.waitForCompletion(true);

    // Successful?
    if (!job.isSuccessful()) {
        System.err.println("Error running job");
        return 1;
    }

    return 0;
}
From source file:gaffer.accumulo.inputformat.example.ExampleDriver.java
License:Apache License
public int run(String[] args) throws Exception {
    // Usage
    if (args.length != 6 && args.length != 7) {
        System.err.println(USAGE);
        return 1;
    }

    // Parse options
    Path outputPath = new Path(args[0]);
    String accumuloPropertiesFile = args[1];
    int numReduceTasks;
    try {
        numReduceTasks = Integer.parseInt(args[2]);
    } catch (NumberFormatException e) {
        System.err.println(USAGE);
        return 1;
    }
    Date startDate = null;
    Date endDate = null;
    boolean useTimeWindow = false;
    if (!args[3].equals("null") && !args[4].equals("null")) {
        try {
            startDate = DATE_FORMAT.parse(args[3]);
            endDate = DATE_FORMAT.parse(args[4]);
        } catch (ParseException e) {
            System.err.println("Error parsing dates: " + args[3] + " " + args[4] + " " + e.getMessage());
            return 1;
        }
        useTimeWindow = true;
    }
    boolean rollUpOverTimeAndVisibility = Boolean.parseBoolean(args[5]);
    boolean seedsSpecified = (args.length == 7);
    String seedsFile = "";
    if (seedsSpecified) {
        seedsFile = args[6];
    }

    // Hadoop configuration
    Configuration conf = getConf();

    // Connect to Accumulo, so we can check connection and check that the table exists
    AccumuloConfig accConf = new AccumuloConfig(accumuloPropertiesFile);
    Connector conn = Accumulo.connect(accConf);
    String tableName = accConf.getTable();
    Authorizations authorizations = conn.securityOperations().getUserAuthorizations(accConf.getUserName());

    // Check if the table exists
    if (!conn.tableOperations().exists(tableName)) {
        System.err.println("Table " + tableName + " does not exist.");
        return 1;
    }

    // Create AccumuloBackedGraph and set view
    AccumuloBackedGraph graph = new AccumuloBackedGraph(conn, tableName);

    //  - Time window
    if (useTimeWindow) {
        graph.setTimeWindow(startDate, endDate);
    }

    //  - Roll up over time and visibility iterator
    graph.rollUpOverTimeAndVisibility(rollUpOverTimeAndVisibility);

    //  - If not specifying seeds then add iterator to avoid seeing the same edge multiple times
    if (seedsSpecified) {
        Set<TypeValue> typeValues = new HashSet<TypeValue>();
        BufferedReader reader = new BufferedReader(new FileReader(seedsFile));
        String line;
        while ((line = reader.readLine()) != null) {
            String[] tokens = line.split("\\|");
            if (tokens.length != 2) {
                System.err.println("Invalid line: " + line);
                continue;
            }
            String type = tokens[0];
            String value = tokens[1];
            typeValues.add(new TypeValue(type, value));
        }
        reader.close();
        // Use AccumuloBackedGraph to update the configuration with the view added above
        graph.setConfiguration(conf, typeValues, accConf);
    } else {
        // Use AccumuloBackedGraph to update the configuration with the view added above
        graph.setConfiguration(conf, accConf);
    }

    // Conf
    conf.setBoolean("mapred.compress.map.output", true);
    conf.setClass("mapred.map.output.compression.codec", SnappyCodec.class, CompressionCodec.class);

    // Job
    Job job = new Job(conf);
    job.setJarByClass(getClass());
    job.setJobName("Example MapReduce against Gaffer data in Accumulo format: input = " + tableName
            + ", output = " + outputPath);

    // Input format - use BatchScannerElementInputFormat if seeds have been specified (as that creates fewer
    // splits); otherwise use ElementInputFormat which is based on the standard AccumuloInputFormat.
    if (seedsSpecified) {
        job.setInputFormatClass(BatchScannerElementInputFormat.class);
    } else {
        job.setInputFormatClass(ElementInputFormat.class);
    }

    // Mapper
    job.setMapperClass(ExampleMapper.class);
    job.setMapOutputKeyClass(GraphElement.class);
    job.setMapOutputValueClass(SetOfStatistics.class);

    // Reducer - use default IdentityReducer for this example
    job.setOutputKeyClass(GraphElement.class);
    job.setOutputValueClass(SetOfStatistics.class);
    job.setNumReduceTasks(numReduceTasks);

    // Output
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);

    System.out.println("Running MapReduce job over:");
    System.out.println("\tTable: " + accConf.getTable());
    System.out.println("\tUser: " + accConf.getUserName());
    System.out.println("\tAuths: " + authorizations);
    if (useTimeWindow) {
        System.out.println("\tFilter by time: start time is " + DATE_FORMAT.format(startDate) + ", "
                + DATE_FORMAT.format(endDate));
    } else {
        System.out.println("\tFilter by time is off");
    }
    System.out.println("\tRoll up over time and visibility: " + rollUpOverTimeAndVisibility);

    // Run job
    job.waitForCompletion(true);

    // Successful?
    if (!job.isSuccessful()) {
        System.err.println("Error running job");
        return 1;
    }

    return 0;
}