List of usage examples for org.apache.hadoop.conf Configuration setStrings
public void setStrings(String name, String... values)
Sets the array of string values for the name property as comma-delimited values.

From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java
License:Apache License
/**
 * pass1: generate collocations, ngrams
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport, Window mode, int winsize)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);
    con.set(WINDOW_TYPE, mode.toString());
    con.setInt(WINDOW_SIZE, winsize);

    if (mode.toString().equalsIgnoreCase("DOCUMENT")) {
        con.setInt("mapred.job.map.memory.mb", 3000);
        con.set("mapred.child.java.opts", "-Xmx2900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx8000M");
        con.setInt("mapred.job.reduce.memory.mb", 8120);
    } else {
        con.setInt("mapred.job.map.memory.mb", 2000);
        con.set("mapred.child.java.opts", "-Xmx1900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx2900M");
        con.setInt("mapred.job.reduce.memory.mb", 3000);
    }
    con.setBoolean("mapred.compress.map.output", true);
    con.setStrings("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setBoolean("mapred.compress.output", true);
    con.setStrings("mapred.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setInt("mapred.task.timeout", 6000000);
    con.setInt("io.sort.factor", 50);
    con.setInt("mapreduce.map.tasks", 256);
    con.setInt("dfs.replication", 1);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(512);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
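In this driver setStrings is called with a single value per property, which is effectively the same as a plain set call; the method earns its keep when several values are passed, since they are joined into one comma-delimited string and split back apart by getStrings. A minimal, self-contained sketch of that round trip (the property name demo.codecs is invented for illustration and is not used by CollocDriver):

import org.apache.hadoop.conf.Configuration;

public class SetStringsDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // setStrings joins the varargs into one comma-delimited value...
        conf.setStrings("demo.codecs",
                "org.apache.hadoop.io.compress.DefaultCodec",
                "org.apache.hadoop.io.compress.GzipCodec");

        // ...so the raw property is a single comma-separated string.
        System.out.println(conf.get("demo.codecs"));

        // getStrings splits it back into an array on the consuming side.
        for (String codec : conf.getStrings("demo.codecs")) {
            System.out.println(codec);
        }
    }
}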
From source file:diamondmapreduce.DiamondMapReduce.java
License:Apache License
int launchHamond(String[] arguments) throws Exception {
    // extract diamond, query, reference and output from array
    String diamond = arguments[0];
    String query = arguments[1];
    String dataBase = arguments[2];
    String outPut = arguments[3];

    // set Hadoop configuration
    Job job = Job.getInstance(getConf(), "DIAMOND");
    Configuration conf = job.getConfiguration();
    SetConf.setHadoopConf(conf);

    // get user name
    userName = HadoopUser.getHadoopUser();

    // delete all existing DIAMOND files under current Hadoop user
    DeleteHDFSFiles.deleteAllFiles(userName);

    // make Hamond directory on HDFS
    MakeHamondHDFSdir.makedir(conf, userName);

    // make DIAMOND database on local then copy to HDFS with query and delete local database
    MakeDB.makeDB(diamond, dataBase);

    // copy DIAMOND bin, query and local database file to HDFS
    CopyFromLocal.copyFromLocal(conf, diamond, query, dataBase, userName);

    // pass query name and database name to mappers
    conf.set(QUERY, query);
    conf.set(DATABASE, dataBase + ".dmnd");
    String[] subArgs = Arrays.copyOfRange(arguments, 4, arguments.length);
    conf.setStrings("DIAMOND-arguments", subArgs);
    conf.setStrings(OUTPUT, outPut);

    // add DIAMOND bin and database into distributed cache
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/diamond"));
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/" + new Path(dataBase).getName() + ".dmnd"));

    // set job input and output paths
    FileInputFormat.addInputPath(job, new Path("/user/" + userName + "/Hamond/" + new Path(query).getName()));
    FileOutputFormat.setOutputPath(job, new Path("/user/" + userName + "/Hamond/out"));

    // set job driver and mapper
    job.setJarByClass(DiamondMapReduce.class);
    job.setMapperClass(DiamondMapper.class);

    // set job input format into customized multilines format
    job.setInputFormatClass(CustomNLineFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(0);

    return job.waitForCompletion(true) ? 0 : 1;
}
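The driver above forwards the leftover command-line arguments under the key "DIAMOND-arguments". A mapper would normally recover them from the job configuration in setup(); the sketch below is hypothetical and is not the project's actual DiamondMapper. Note that setStrings stores the array as one comma-delimited string, so argument values that themselves contain commas will not survive the round trip intact.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical consumer-side sketch; not the real DiamondMapper from the project.
public class ArgsAwareMapper extends Mapper<LongWritable, Text, Text, Text> {
    private String[] diamondArgs;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        // getStrings splits the comma-delimited value written by setStrings back into an array;
        // the varargs default is returned when the property is absent.
        diamondArgs = conf.getStrings("DIAMOND-arguments", new String[0]);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // The recovered DIAMOND arguments would be appended to the external command here.
        context.write(new Text("args:" + diamondArgs.length), value);
    }
}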
From source file:diamondmapreduce.DiamondMapReduce.java
License:Apache License
int launchHamondAWS(String[] arguments) throws Exception {
    // extract diamond, query, reference and output from array
    String diamond = arguments[0];
    String query = arguments[1];
    String dataBase = arguments[2];
    String outPut = arguments[3];

    // set Hadoop configuration
    Job job = Job.getInstance(getConf(), "DIAMOND");
    Configuration conf = job.getConfiguration();
    SetConf.setHadoopConf(conf);

    // get user name
    userName = HadoopUser.getHadoopUser();

    // delete all existing DIAMOND files under current Hadoop user
    DeleteHDFSFiles.deleteAllFiles(userName);

    // make local Hamond dir
    awshamondsidefunctions.MakeHamondDir.make();

    // copy DIAMOND, query, reference from S3 to master local
    awshamondsidefunctions.CopyFromS3.copyFromS3(diamond, query, dataBase);

    // make Hamond directory on HDFS
    MakeHamondHDFSdir.makedir(conf, userName);

    // make DIAMOND database on local then copy to HDFS with query and delete local database
    MakeDB.makeDB("/mnt/Hamond/diamond", "/mnt/Hamond/" + new Path(dataBase).getName());

    // copy DIAMOND bin, query and local database file to HDFS
    CopyFromLocal.copyFromLocal(conf, "/mnt/Hamond/diamond", "/mnt/Hamond/" + new Path(query).getName(),
            "/mnt/Hamond/" + new Path(dataBase).getName(), userName);

    // pass query name and database name to mappers
    conf.set(QUERY, query);
    conf.set(DATABASE, dataBase);
    conf.set(OUTPUT, outPut);
    String[] subArgs = Arrays.copyOfRange(arguments, 4, arguments.length);
    conf.setStrings("DIAMOND-arguments", subArgs);
    conf.setStrings(OUTPUT, outPut);

    // add DIAMOND bin and database into distributed cache
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/diamond"));
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/" + new Path(dataBase).getName() + ".dmnd"));

    // set job input and output paths
    FileInputFormat.addInputPath(job, new Path("/user/" + userName + "/Hamond/" + new Path(query).getName()));
    FileOutputFormat.setOutputPath(job, new Path("/user/" + userName + "/Hamond/out"));

    // set job driver, mapper and reducer
    job.setJarByClass(DiamondMapReduce.class);
    job.setMapperClass(DiamondMapper.class);
    job.setReducerClass(AWSDiamondReducer.class);

    // set job input format into customized multilines format
    job.setInputFormatClass(CustomNLineFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:edu.cuhk.hccl.hadoop.HadoopApp.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args == null || args.length < 4) {
        System.out.println("Please specify parameters: input, output, domain, num-reducers!");
        System.exit(-1);
    }
    String input = args[0];
    String output = args[1];
    String domain = args[2];
    int numReducers = Integer.parseInt(args[3]);
    float similarity = Float.parseFloat(args[4]);
    int range = Integer.parseInt(args[5]);

    Job job = new Job(new Configuration(), this.getClass().getSimpleName());
    // Must come after the job is created.
    Configuration conf = job.getConfiguration();

    // Reuse the JVM
    conf.setInt("mapred.job.reuse.jvm.num.tasks", -1);

    conf.setFloat("SIM_THRESHOLD", similarity);
    conf.setInt("SEARCH_RANGE", range);

    if (domain.equalsIgnoreCase("restaurant")) {
        conf.setStrings("ASPECTS", Constant.RESTAURANT_ASPECTS);
        job.setMapperClass(YelpMapper.class);
        job.setInputFormatClass(TextInputFormat.class);

        // args[4] is the business file to select matching business_ids to restaurant
        String busiFile = args[6];
        DistributedCache.addCacheFile(new URI(busiFile), conf);
    } else if (domain.equalsIgnoreCase("hotel")) {
        conf.setStrings("ASPECTS", Constant.TRIPADVISOR_ASPECTS);
        job.setMapperClass(TripAdvisorMapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
    } else {
        System.out.println("Wrong domain type!");
        System.exit(-1);
    }

    job.setJarByClass(HadoopApp.class);
    job.setReducerClass(ReviewReducer.class);
    job.setNumReduceTasks(numReducers);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(UserItemPair.class);
    job.setOutputValueClass(NounPhrase.class);

    // Delete output if exists
    Path outputDir = new Path(output);
    FileSystem hdfs = FileSystem.get(conf);
    if (hdfs.exists(outputDir))
        hdfs.delete(outputDir, true);

    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.waitForCompletion(true);
    return 0;
}
From source file:edu.indiana.d2i.htrc.io.mem.MemCachedUtil.java
License:Apache License
public static void configHelper(Configuration conf, String memhostsPath) throws IOException {
    List<String> hosts = new ArrayList<String>();
    FileSystem fs = FileSystem.get(conf);
    DataInputStream fsinput = new DataInputStream(fs.open(new Path(memhostsPath)));
    BufferedReader reader = new BufferedReader(new InputStreamReader(fsinput));
    String line = null;
    while ((line = reader.readLine()) != null) {
        hosts.add(line);
    }
    reader.close();
    String[] hostsArray = hosts.toArray(new String[hosts.size()]);

    conf.setInt(HTRCConstants.MEMCACHED_CLIENT_NUM, 1);
    // conf.setInt(HTRCConstants.MEMCACHED_MAX_EXPIRE, Integer.MAX_VALUE);
    conf.setInt(HTRCConstants.MEMCACHED_MAX_EXPIRE, 60 * 60 * 60); // seconds
    conf.setStrings(HTRCConstants.MEMCACHED_HOSTS, hostsArray);
}
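This helper reads one memcached host per line from HDFS and ships the whole list through the configuration. On the task side the array would come back via getStrings. A small self-contained sketch of that round trip; the key name below is a stand-in, since the real value of HTRCConstants.MEMCACHED_HOSTS is defined elsewhere in that project:

import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;

public class MemcachedHostsDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Stand-in for HTRCConstants.MEMCACHED_HOSTS (hypothetical key name).
        String hostsKey = "htrc.memcached.hosts";

        // Driver side: the whole array is stored as one comma-delimited property.
        conf.setStrings(hostsKey, "cache01:11211", "cache02:11211", "cache03:11211");

        // Task side: getStrings restores the array for building the memcached client.
        String[] hosts = conf.getStrings(hostsKey);
        System.out.println(Arrays.toString(hosts));
    }
}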
From source file:edu.umd.gorden2.BuildPersonalizedPageRankRecords.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    options.addOption(
            OptionBuilder.withArgName("sources").hasArg().withDescription("sources").create("sources"));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    String m = cmdline.getOptionValue("sources");

    LOG.info("Tool name: " + BuildPersonalizedPageRankRecords.class.getSimpleName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - numNodes: " + n);
    LOG.info(" - sources: " + m);

    Configuration conf = getConf();
    conf.setInt(NODE_CNT_FIELD, n);
    conf.setStrings("sources", m);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Job job = Job.getInstance(conf);
    job.setJobName(BuildPersonalizedPageRankRecords.class.getSimpleName() + ":" + inputPath);
    job.setJarByClass(BuildPersonalizedPageRankRecords.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}
From source file:edu.umd.shrawanraina.BuildPersonalizedPageRankRecords.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    options.addOption(
            OptionBuilder.withArgName("node").hasArg().withDescription("source nodes").create(SOURCES));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)
            || !cmdline.hasOption(SOURCES)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    String sources = cmdline.getOptionValue(SOURCES);

    LOG.info("Tool name: " + BuildPersonalizedPageRankRecords.class.getSimpleName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - numNodes: " + n);
    LOG.info(" - sources: " + sources);

    Configuration conf = getConf();
    conf.setInt(NODE_CNT_FIELD, n);
    conf.setStrings("sources", sources);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Job job = Job.getInstance(conf);
    job.setJobName(BuildPersonalizedPageRankRecords.class.getSimpleName() + ":" + inputPath);
    job.setJarByClass(BuildPersonalizedPageRankRecords.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNodeUpd.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNodeUpd.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}
From source file:edu.umd.shrawanraina.RunPersonalizedPageRankBasic.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(COMBINER, "use combiner"));
    options.addOption(new Option(INMAPPER_COMBINER, "user in-mapper combiner"));
    options.addOption(new Option(RANGE, "use range partitioner"));

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("base path").create(BASE));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("start iteration").create(START));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("end iteration").create(END));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    options.addOption(
            OptionBuilder.withArgName("node").hasArg().withDescription("source nodes").create(SOURCES));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(BASE) || !cmdline.hasOption(START) || !cmdline.hasOption(END)
            || !cmdline.hasOption(NUM_NODES) || !cmdline.hasOption(SOURCES)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String basePath = cmdline.getOptionValue(BASE);
    int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    int s = Integer.parseInt(cmdline.getOptionValue(START));
    int e = Integer.parseInt(cmdline.getOptionValue(END));
    String sources = cmdline.getOptionValue(SOURCES);

    boolean useCombiner = cmdline.hasOption(COMBINER);
    boolean useInmapCombiner = cmdline.hasOption(INMAPPER_COMBINER);
    boolean useRange = cmdline.hasOption(RANGE);

    LOG.info("Tool name: RunPageRank");
    LOG.info(" - base path: " + basePath);
    LOG.info(" - num nodes: " + n);
    LOG.info(" - start iteration: " + s);
    LOG.info(" - end iteration: " + e);
    LOG.info(" - sources: " + Arrays.asList(sources.split("\\s*(,)\\s*")));
    LOG.info(" - use combiner: " + useCombiner);
    LOG.info(" - use in-mapper combiner: " + useInmapCombiner);
    LOG.info(" - user range partitioner: " + useRange);

    Configuration conf = getConf();
    conf.setStrings("sources", sources);

    // Iterate PageRank.
    for (int i = s; i < e; i++) {
        iteratePageRank(sources, i, i + 1, basePath, n, useCombiner, useInmapCombiner);
    }

    return 0;
}
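Both PageRank tools above push the comma-separated source list into the configuration under the key "sources". A mapper typically reads it back in setup() and parses the node ids; the following is a hypothetical sketch, not the actual MyMapper used by these tools:

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical consumer-side sketch showing how the "sources" list is recovered.
public class SourceAwareMapper extends Mapper<IntWritable, IntWritable, IntWritable, BooleanWritable> {
    private int[] sourceIds;

    @Override
    protected void setup(Context context) {
        Configuration conf = context.getConfiguration();
        // "sources" was written with conf.setStrings(...), so getStrings splits it back into elements.
        String[] sources = conf.getStrings("sources", new String[0]);
        sourceIds = new int[sources.length];
        for (int i = 0; i < sources.length; i++) {
            sourceIds[i] = Integer.parseInt(sources[i].trim());
        }
        Arrays.sort(sourceIds); // enables binarySearch in map()
    }

    @Override
    protected void map(IntWritable nodeId, IntWritable value, Context context)
            throws IOException, InterruptedException {
        boolean isSource = Arrays.binarySearch(sourceIds, nodeId.get()) >= 0;
        context.write(nodeId, new BooleanWritable(isSource));
    }
}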
From source file:fi.tkk.ics.hadoop.bam.cli.plugins.chipster.Summarize.java
License:Open Source License
@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    switch (args.size()) {
    case 0:
        return missingArg("WORKDIR");
    case 1:
        return missingArg("LEVELS");
    case 2:
        return missingArg("INPATH");
    default:
        break;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    levels = args.get(1).split(",");
    for (String l : levels) {
        try {
            int lvl = Integer.parseInt(l);
            if (lvl > 0)
                continue;
            System.err.printf("summarize :: summary level '%d' is not positive!\n", lvl);
        } catch (NumberFormatException e) {
            System.err.printf("summarize :: summary level '%s' is not an integer!\n", l);
        }
        return 3;
    }

    wrkDir = new Path(args.get(0));
    final Path bam = new Path(args.get(2));

    final boolean sort = parser.getBoolean(sortOpt);

    final Configuration conf = getConf();

    conf.setBoolean(AnySAMInputFormat.TRUST_EXTS_PROPERTY, !parser.getBoolean(noTrustExtsOpt));

    // Used by Utils.getMergeableWorkFile() to name the output files.
    wrkFile = bam.getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, wrkFile);

    conf.setStrings(SummarizeReducer.SUMMARY_LEVELS_PROP, levels);

    try {
        try {
            // There's a lot of different Paths here, and it can get a bit
            // confusing. Here's how it works:
            //
            // - outPath is the output dir for the final merged output, given
            //   with the -o parameter.
            //
            // - wrkDir is the user-given path where the outputs of the
            //   reducers go.
            //
            // - mergedTmpDir (defined further below) is $wrkDir/sort.tmp: if
            //   we are sorting, the summaries output in the first Hadoop job
            //   are merged in there.
            //
            // - mainSortOutputDir is $wrkDir/sorted.tmp: getSortOutputDir()
            //   gives a per-level/strand directory under it, which is used by
            //   doSorting() and mergeOne(). This is necessary because we
            //   cannot have multiple Hadoop jobs outputting into the same
            //   directory at the same time, as explained in the comment in
            //   sortMerged().

            // Required for path ".", for example.
            wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

            mainSortOutputDir = sort ? new Path(wrkDir, "sorted.tmp") : null;

            if (!runSummary(bam))
                return 4;
        } catch (IOException e) {
            System.err.printf("summarize :: Summarizing failed: %s\n", e);
            return 4;
        }

        Path mergedTmpDir = null;
        try {
            if (sort) {
                mergedTmpDir = new Path(wrkDir, "sort.tmp");
                mergeOutputs(mergedTmpDir);
            } else if (outPath != null)
                mergeOutputs(outPath);
        } catch (IOException e) {
            System.err.printf("summarize :: Merging failed: %s\n", e);
            return 5;
        }

        if (sort) {
            if (!doSorting(mergedTmpDir))
                return 6;

            // Reset this since SummarySort uses it.
            conf.set(Utils.WORK_FILENAME_PROPERTY, wrkFile);

            tryDelete(mergedTmpDir);

            if (outPath != null)
                try {
                    sorted = true;
                    mergeOutputs(outPath);
                } catch (IOException e) {
                    System.err.printf("summarize :: Merging sorted output failed: %s\n", e);
                    return 7;
                }
            else {
                // Move the unmerged results out of the mainSortOutputDir
                // subdirectories to wrkDir.
                System.out.println("summarize :: Moving outputs from temporary directories...");
                t.start();

                try {
                    final FileSystem fs = wrkDir.getFileSystem(conf);
                    for (String lvl : levels) {
                        final FileStatus[] parts;

                        try {
                            parts = fs.globStatus(new Path(new Path(mainSortOutputDir, lvl + "[fr]"),
                                    "*-[0-9][0-9][0-9][0-9][0-9][0-9]"));
                        } catch (IOException e) {
                            System.err.printf("summarize :: Couldn't move level %s results: %s", lvl, e);
                            continue;
                        }

                        for (FileStatus part : parts) {
                            final Path path = part.getPath();
                            try {
                                fs.rename(path, new Path(wrkDir, path.getName()));
                            } catch (IOException e) {
                                System.err.printf("summarize :: Couldn't move '%s': %s", path, e);
                            }
                        }
                    }
                } catch (IOException e) {
                    System.err.printf("summarize :: Moving results failed: %s", e);
                }
                System.out.printf("summarize :: Moved in %d.%03d s.\n", t.stopS(), t.fms());
            }

            tryDelete(mainSortOutputDir);
        }
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }
    return 0;
}
From source file:fi.tkk.ics.hadoop.bam.cli.plugins.FixMate.java
License:Open Source License
@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("fixmate :: WORKDIR not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("fixmate :: INPATH not given.");
        return 3;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    final SAMFileReader.ValidationStringency stringency = Utils.toStringency(parser.getOptionValue(
            stringencyOpt, SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY.toString()), "fixmate");
    if (stringency == null)
        return 3;

    Path wrkDir = new Path(args.get(0));

    final List<String> strInputs = args.subList(1, args.size());
    final List<Path> inputs = new ArrayList<Path>(strInputs.size());
    for (final String in : strInputs)
        inputs.add(new Path(in));

    final Configuration conf = getConf();

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    if (stringency != null)
        conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString());

    final boolean globalSort = parser.getBoolean(sortOpt);
    if (globalSort)
        Utils.setHeaderMergerSortOrder(conf, SAMFileHeader.SortOrder.queryname);

    conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0]));

    final Timer t = new Timer();
    try {
        // Required for path ".", for example.
        wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

        if (globalSort)
            Utils.configureSampling(wrkDir, intermediateOutName, conf);

        final Job job = new Job(conf);

        job.setJarByClass(FixMate.class);
        job.setMapperClass(FixMateMapper.class);
        job.setReducerClass(FixMateReducer.class);

        if (!parser.getBoolean(noCombinerOpt))
            job.setCombinerClass(FixMateReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(SAMRecordWritable.class);

        job.setInputFormatClass(AnySAMInputFormat.class);
        job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);

        for (final Path in : inputs)
            FileInputFormat.addInputPath(job, in);

        FileOutputFormat.setOutputPath(job, wrkDir);

        if (globalSort) {
            job.setPartitionerClass(TotalOrderPartitioner.class);

            System.out.println("fixmate :: Sampling...");
            t.start();

            InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job,
                    new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01, 10000,
                            Math.max(100, reduceTasks)));

            System.out.printf("fixmate :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());
        }

        job.submit();

        System.out.println("fixmate :: Waiting for job completion...");
        t.start();

        if (!job.waitForCompletion(verbose)) {
            System.err.println("fixmate :: Job failed.");
            return 4;
        }

        System.out.printf("fixmate :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
        System.err.printf("fixmate :: Hadoop error: %s\n", e);
        return 4;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    if (outPath != null)
        try {
            Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "fixmate");
        } catch (IOException e) {
            System.err.printf("fixmate :: Output merging failed: %s\n", e);
            return 5;
        }

    return 0;
}
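FixMate hands a whole list of input path strings to setStrings via toArray(new String[0]); reading them back as Path objects is the mirror image. A short self-contained sketch of that pattern, with an invented property name and paths rather than Hadoop-BAM's real Utils.HEADERMERGER_INPUTS_PROPERTY:

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class HeaderMergerInputsDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Driver side: store a list of input locations under one property (hypothetical key name).
        List<String> strInputs = new ArrayList<String>();
        strInputs.add("hdfs:///data/sample1.bam");
        strInputs.add("hdfs:///data/sample2.bam");
        conf.setStrings("demo.headermerger.inputs", strInputs.toArray(new String[0]));

        // Consumer side: rebuild the Path objects from the comma-delimited value.
        for (String in : conf.getStrings("demo.headermerger.inputs")) {
            Path p = new Path(in);
            System.out.println(p.getName());
        }
    }
}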