Example usage for org.apache.hadoop.conf Configuration setStrings

Introduction

On this page you can find example usages of the org.apache.hadoop.conf.Configuration.setStrings method, collected from open-source projects.

Prototype

public void setStrings(String name, String... values) 

Document

Sets the array of string values for the name property as comma-delimited values.
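
Because the values are joined into a single comma-delimited string, a value that itself contains a comma will not survive the round trip intact. A minimal sketch of the set/get round trip (the property name here is chosen for illustration):

import org.apache.hadoop.conf.Configuration;

public class SetStringsDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Stored internally as the single value "alpha,beta,gamma".
        conf.setStrings("demo.values", "alpha", "beta", "gamma");

        // getStrings splits the stored value on commas again.
        for (String value : conf.getStrings("demo.values")) {
            System.out.println(value);
        }
    }
}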

Usage

From source file: de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java

License: Apache License

/**
 * pass1: generate collocations, ngrams
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport, Window mode, int winsize)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);
    con.set(WINDOW_TYPE, mode.toString());
    con.setInt(WINDOW_SIZE, winsize);

    if (mode.toString().equalsIgnoreCase("DOCUMENT")) {
        con.setInt("mapred.job.map.memory.mb", 3000);

        con.set("mapred.child.java.opts", "-Xmx2900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx8000M");

        con.setInt("mapred.job.reduce.memory.mb", 8120);
    } else {
        con.setInt("mapred.job.map.memory.mb", 2000);

        con.set("mapred.child.java.opts", "-Xmx1900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx2900M");

        con.setInt("mapred.job.reduce.memory.mb", 3000);
    }
    con.setBoolean("mapred.compress.map.output", true);
    con.setStrings("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setBoolean("mapred.compress.output", true);
    con.setStrings("mapred.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setInt("mapred.task.timeout", 6000000);
    con.setInt("io.sort.factor", 50);
    con.setInt("mapreduce.map.tasks", 256);
    con.setInt("dfs.replication", 1);
    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(512);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
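
Both setStrings calls above pass a single value, in which case setStrings behaves like a plain set(name, value): with nothing to join, the codec class name is stored verbatim. A minimal illustration:

Configuration c = new Configuration();
c.setStrings("mapred.map.output.compression.codec",
        "org.apache.hadoop.io.compress.DefaultCodec");
// Prints the class name unchanged; a single value contains no join commas.
System.out.println(c.get("mapred.map.output.compression.codec"));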

From source file: diamondmapreduce.DiamondMapReduce.java

License: Apache License

int launchHamond(String[] arguments) throws Exception {

    //extract diamond, query, reference and output from array
    String diamond = arguments[0];
    String query = arguments[1];
    String dataBase = arguments[2];
    String outPut = arguments[3];

    //set Hadoop configuration
    Job job = Job.getInstance(getConf(), "DIAMOND");
    Configuration conf = job.getConfiguration();
    SetConf.setHadoopConf(conf);

    //get user name
    userName = HadoopUser.getHadoopUser();

    //delete all existing DIAMOND files under current Hadoop user
    DeleteHDFSFiles.deleteAllFiles(userName);

    //make Hamond directory on HDFS
    MakeHamondHDFSdir.makedir(conf, userName);

    //make DIAMOND database on local then copy to HDFS with query and delete local database
    MakeDB.makeDB(diamond, dataBase);

    //copy DIAMOND bin, query and local database file to HDFS
    CopyFromLocal.copyFromLocal(conf, diamond, query, dataBase, userName);

    //pass query name and database name to mappers
    conf.set(QUERY, query);
    conf.set(DATABASE, dataBase + ".dmnd");
    String[] subArgs = Arrays.copyOfRange(arguments, 4, arguments.length);
    conf.setStrings("DIAMOND-arguments", subArgs);
    conf.setStrings(OUTPUT, outPut);

    //add DIAMOND bin and database into distributed cache
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/diamond"));
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/" + new Path(dataBase).getName() + ".dmnd"));

    //set job input and output paths
    FileInputFormat.addInputPath(job, new Path("/user/" + userName + "/Hamond/" + new Path(query).getName()));
    FileOutputFormat.setOutputPath(job, new Path("/user/" + userName + "/Hamond/out"));

    //set job driver and mapper
    job.setJarByClass(DiamondMapReduce.class);
    job.setMapperClass(DiamondMapper.class);

    //set job input format into customized multilines format
    job.setInputFormatClass(CustomNLineFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(0);

    return job.waitForCompletion(true) ? 0 : 1;

}
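
The driver hands the trailing command-line arguments to setStrings, whose String... parameter accepts the String[] directly. On the task side they can be read back with the matching getter. A hedged sketch of what a setup() override in the mapper might look like (the actual DiamondMapper code is not shown here):

@Override
protected void setup(Context context) {
    // getStrings splits the comma-delimited value back into an array.
    // Caveat: any single argument containing a comma would be split too.
    String[] diamondArgs =
            context.getConfiguration().getStrings("DIAMOND-arguments");
}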

From source file: diamondmapreduce.DiamondMapReduce.java

License: Apache License

int launchHamondAWS(String[] arguments) throws Exception {

    //extract diamond, query, reference and output from array
    String diamond = arguments[0];
    String query = arguments[1];
    String dataBase = arguments[2];
    String outPut = arguments[3];

    //set Hadoop configuration
    Job job = Job.getInstance(getConf(), "DIAMOND");
    Configuration conf = job.getConfiguration();
    SetConf.setHadoopConf(conf);

    //get user name
    userName = HadoopUser.getHadoopUser();

    //delete all existing DIAMOND files under current Hadoop user
    DeleteHDFSFiles.deleteAllFiles(userName);

    //make local Hamond dir
    awshamondsidefunctions.MakeHamondDir.make();

    //copy DIAMOND, query, reference from S3 to master local
    awshamondsidefunctions.CopyFromS3.copyFromS3(diamond, query, dataBase);

    //make Hamond directory on HDFS
    MakeHamondHDFSdir.makedir(conf, userName);

    //make DIAMOND database on local then copy to HDFS with query and delete local database
    MakeDB.makeDB("/mnt/Hamond/diamond", "/mnt/Hamond/" + new Path(dataBase).getName());

    //copy DIAMOND bin, query and local database file to HDFS
    CopyFromLocal.copyFromLocal(conf, "/mnt/Hamond/diamond", "/mnt/Hamond/" + new Path(query).getName(),
            "/mnt/Hamond/" + new Path(dataBase).getName(), userName);

    //pass query name and database name to mappers
    conf.set(QUERY, query);
    conf.set(DATABASE, dataBase);
    conf.set(OUTPUT, outPut);
    String[] subArgs = Arrays.copyOfRange(arguments, 4, arguments.length);
    conf.setStrings("DIAMOND-arguments", subArgs);
    conf.setStrings(OUTPUT, outPut);

    //add DIAMOND bin and database into distributed cache
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/diamond"));
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/" + new Path(dataBase).getName() + ".dmnd"));

    //set job input and output paths
    FileInputFormat.addInputPath(job, new Path("/user/" + userName + "/Hamond/" + new Path(query).getName()));
    FileOutputFormat.setOutputPath(job, new Path("/user/" + userName + "/Hamond/out"));

    //set job driver and mapper
    job.setJarByClass(DiamondMapReduce.class);
    job.setMapperClass(DiamondMapper.class);
    job.setReducerClass(AWSDiamondReducer.class);

    //set job input format into customized multilines format
    job.setInputFormatClass(CustomNLineFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;

}

From source file: edu.cuhk.hccl.hadoop.HadoopApp.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    if (args == null || args.length < 6) {
        System.out.println("Please specify parameters: input, output, domain, num-reducers, similarity, range!");
        System.exit(-1);
    }

    String input = args[0];
    String output = args[1];
    String domain = args[2];
    int numReducers = Integer.parseInt(args[3]);
    float similarity = Float.parseFloat(args[4]);
    int range = Integer.parseInt(args[5]);

    Job job = new Job(new Configuration(), this.getClass().getSimpleName());

    // Must come after the Job is created
    Configuration conf = job.getConfiguration();
    // Reuse the JVM
    conf.setInt("mapred.job.reuse.jvm.num.tasks", -1);
    conf.setFloat("SIM_THRESHOLD", similarity);
    conf.setInt("SEARCH_RANGE", range);

    if (domain.equalsIgnoreCase("restaurant")) {
        conf.setStrings("ASPECTS", Constant.RESTAURANT_ASPECTS);
        job.setMapperClass(YelpMapper.class);
        job.setInputFormatClass(TextInputFormat.class);

        // args[6] is the business file used to select business_ids matching the restaurant domain
        String busiFile = args[6];
        DistributedCache.addCacheFile(new URI(busiFile), conf);
    } else if (domain.equalsIgnoreCase("hotel")) {
        conf.setStrings("ASPECTS", Constant.TRIPADVISOR_ASPECTS);
        job.setMapperClass(TripAdvisorMapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
    } else {
        System.out.println("Wrong domain type!");
        System.exit(-1);
    }

    job.setJarByClass(HadoopApp.class);
    job.setReducerClass(ReviewReducer.class);
    job.setNumReduceTasks(numReducers);

    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(UserItemPair.class);
    job.setOutputValueClass(NounPhrase.class);

    // Delete output if exists
    Path outputDir = new Path(output);
    FileSystem hdfs = FileSystem.get(conf);
    if (hdfs.exists(outputDir))
        hdfs.delete(outputDir, true);

    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.waitForCompletion(true);
    return 0;
}

From source file: edu.indiana.d2i.htrc.io.mem.MemCachedUtil.java

License: Apache License

public static void configHelper(Configuration conf, String memhostsPath) throws IOException {
    List<String> hosts = new ArrayList<String>();
    FileSystem fs = FileSystem.get(conf);
    DataInputStream fsinput = new DataInputStream(fs.open(new Path(memhostsPath)));
    BufferedReader reader = new BufferedReader(new InputStreamReader(fsinput));
    String line = null;
    while ((line = reader.readLine()) != null) {
        hosts.add(line);
    }
    reader.close();
    String[] hostsArray = hosts.toArray(new String[hosts.size()]);

    conf.setInt(HTRCConstants.MEMCACHED_CLIENT_NUM, 1);
    //      conf.setInt(HTRCConstants.MEMCACHED_MAX_EXPIRE, Integer.MAX_VALUE);
    conf.setInt(HTRCConstants.MEMCACHED_MAX_EXPIRE, 60 * 60 * 60); // seconds
    conf.setStrings(HTRCConstants.MEMCACHED_HOSTS, hostsArray);
}
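
Each line of the hosts file becomes one element of the stored list, so host names must not contain commas. A hedged sketch of the consuming side (hypothetical; only the property key is taken from the code above):

// Read the memcached host list back as an array.
String[] hosts = conf.getStrings(HTRCConstants.MEMCACHED_HOSTS);
for (String host : hosts) {
    // e.g. open one client connection per host...
}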

From source file: edu.umd.gorden2.BuildPersonalizedPageRankRecords.java

License: Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    options.addOption(
            OptionBuilder.withArgName("sources").hasArg().withDescription("sources").create("sources"));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    String m = cmdline.getOptionValue("sources");

    LOG.info("Tool name: " + BuildPersonalizedPageRankRecords.class.getSimpleName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - numNodes: " + n);
    LOG.info(" - sources: " + m);

    Configuration conf = getConf();
    conf.setInt(NODE_CNT_FIELD, n);
    conf.setStrings("sources", m);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Job job = Job.getInstance(conf);
    job.setJobName(BuildPersonalizedPageRankRecords.class.getSimpleName() + ":" + inputPath);
    job.setJarByClass(BuildPersonalizedPageRankRecords.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file: edu.umd.shrawanraina.BuildPersonalizedPageRankRecords.java

License: Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    options.addOption(
            OptionBuilder.withArgName("node").hasArg().withDescription("source nodes").create(SOURCES));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)
            || !cmdline.hasOption(SOURCES)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    String sources = cmdline.getOptionValue(SOURCES);

    LOG.info("Tool name: " + BuildPersonalizedPageRankRecords.class.getSimpleName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - numNodes: " + n);
    LOG.info(" - sources: " + sources);

    Configuration conf = getConf();
    conf.setInt(NODE_CNT_FIELD, n);
    conf.setStrings("sources", sources);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Job job = Job.getInstance(conf);
    job.setJobName(BuildPersonalizedPageRankRecords.class.getSimpleName() + ":" + inputPath);
    job.setJarByClass(BuildPersonalizedPageRankRecords.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNodeUpd.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNodeUpd.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file: edu.umd.shrawanraina.RunPersonalizedPageRankBasic.java

License: Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(COMBINER, "use combiner"));
    options.addOption(new Option(INMAPPER_COMBINER, "use in-mapper combiner"));
    options.addOption(new Option(RANGE, "use range partitioner"));

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("base path").create(BASE));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("start iteration").create(START));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("end iteration").create(END));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    options.addOption(
            OptionBuilder.withArgName("node").hasArg().withDescription("source nodes").create(SOURCES));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(BASE) || !cmdline.hasOption(START) || !cmdline.hasOption(END)
            || !cmdline.hasOption(NUM_NODES) || !cmdline.hasOption(SOURCES)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String basePath = cmdline.getOptionValue(BASE);
    int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    int s = Integer.parseInt(cmdline.getOptionValue(START));
    int e = Integer.parseInt(cmdline.getOptionValue(END));
    String sources = cmdline.getOptionValue(SOURCES);
    boolean useCombiner = cmdline.hasOption(COMBINER);
    boolean useInmapCombiner = cmdline.hasOption(INMAPPER_COMBINER);
    boolean useRange = cmdline.hasOption(RANGE);

    LOG.info("Tool name: RunPageRank");
    LOG.info(" - base path: " + basePath);
    LOG.info(" - num nodes: " + n);
    LOG.info(" - start iteration: " + s);
    LOG.info(" - end iteration: " + e);
    LOG.info(" - sources: " + Arrays.asList(sources.split("\\s*(,)\\s*")));
    LOG.info(" - use combiner: " + useCombiner);
    LOG.info(" - use in-mapper combiner: " + useInmapCombiner);
    LOG.info(" - user range partitioner: " + useRange);

    Configuration conf = getConf();
    conf.setStrings("sources", sources);
    // Iterate PageRank.
    for (int i = s; i < e; i++) {
        iteratePageRank(sources, i, i + 1, basePath, n, useCombiner, useInmapCombiner);
    }

    return 0;
}
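
Here the sources option is already a comma-separated string. setStrings stores it verbatim, so a later getStrings("sources") yields the individual node IDs as an array; a plain conf.set would have produced the same stored value. A hedged sketch of the reading side (mapper context assumed for illustration):

// Hypothetical mapper-side read: a value stored as "n1,n2,n3" comes
// back from getStrings as the array {"n1", "n2", "n3"}.
String[] sourceNodes = context.getConfiguration().getStrings("sources");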

From source file: fi.tkk.ics.hadoop.bam.cli.plugins.chipster.Summarize.java

License: Open Source License

@Override
protected int run(CmdLineParser parser) {

    final List<String> args = parser.getRemainingArgs();
    switch (args.size()) {
    case 0:
        return missingArg("WORKDIR");
    case 1:
        return missingArg("LEVELS");
    case 2:
        return missingArg("INPATH");
    default:
        break;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    levels = args.get(1).split(",");
    for (String l : levels) {
        try {
            int lvl = Integer.parseInt(l);
            if (lvl > 0)
                continue;
            System.err.printf("summarize :: summary level '%d' is not positive!\n", lvl);
        } catch (NumberFormatException e) {
            System.err.printf("summarize :: summary level '%s' is not an integer!\n", l);
        }
        return 3;
    }

    wrkDir = new Path(args.get(0));
    final Path bam = new Path(args.get(2));

    final boolean sort = parser.getBoolean(sortOpt);

    final Configuration conf = getConf();

    conf.setBoolean(AnySAMInputFormat.TRUST_EXTS_PROPERTY, !parser.getBoolean(noTrustExtsOpt));

    // Used by Utils.getMergeableWorkFile() to name the output files.
    wrkFile = bam.getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, wrkFile);

    conf.setStrings(SummarizeReducer.SUMMARY_LEVELS_PROP, levels);

    try {
        try {
            // There's a lot of different Paths here, and it can get a bit
            // confusing. Here's how it works:
            //
            // - outPath is the output dir for the final merged output, given
            //   with the -o parameter.
            //
            // - wrkDir is the user-given path where the outputs of the
            //   reducers go.
            //
            // - mergedTmpDir (defined further below) is $wrkDir/sort.tmp: if
            //   we are sorting, the summaries output in the first Hadoop job
            //   are merged in there.
            //
            // - mainSortOutputDir is $wrkDir/sorted.tmp: getSortOutputDir()
            //   gives a per-level/strand directory under it, which is used by
            //   doSorting() and mergeOne(). This is necessary because we
            //   cannot have multiple Hadoop jobs outputting into the same
            //   directory at the same time, as explained in the comment in
            //   sortMerged().

            // Required for path ".", for example.
            wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

            mainSortOutputDir = sort ? new Path(wrkDir, "sorted.tmp") : null;

            if (!runSummary(bam))
                return 4;
        } catch (IOException e) {
            System.err.printf("summarize :: Summarizing failed: %s\n", e);
            return 4;
        }

        Path mergedTmpDir = null;
        try {
            if (sort) {
                mergedTmpDir = new Path(wrkDir, "sort.tmp");
                mergeOutputs(mergedTmpDir);

            } else if (outPath != null)
                mergeOutputs(outPath);

        } catch (IOException e) {
            System.err.printf("summarize :: Merging failed: %s\n", e);
            return 5;
        }

        if (sort) {
            if (!doSorting(mergedTmpDir))
                return 6;

            // Reset this since SummarySort uses it.
            conf.set(Utils.WORK_FILENAME_PROPERTY, wrkFile);

            tryDelete(mergedTmpDir);

            if (outPath != null)
                try {
                    sorted = true;
                    mergeOutputs(outPath);
                } catch (IOException e) {
                    System.err.printf("summarize :: Merging sorted output failed: %s\n", e);
                    return 7;
                }
            else {
                // Move the unmerged results out of the mainSortOutputDir
                // subdirectories to wrkDir.

                System.out.println("summarize :: Moving outputs from temporary directories...");
                t.start();

                try {
                    final FileSystem fs = wrkDir.getFileSystem(conf);
                    for (String lvl : levels) {
                        final FileStatus[] parts;

                        try {
                            parts = fs.globStatus(new Path(new Path(mainSortOutputDir, lvl + "[fr]"),
                                    "*-[0-9][0-9][0-9][0-9][0-9][0-9]"));
                        } catch (IOException e) {
                            System.err.printf("summarize :: Couldn't move level %s results: %s", lvl, e);
                            continue;
                        }

                        for (FileStatus part : parts) {
                            final Path path = part.getPath();
                            try {
                                fs.rename(path, new Path(wrkDir, path.getName()));
                            } catch (IOException e) {
                                System.err.printf("summarize :: Couldn't move '%s': %s", path, e);
                            }
                        }
                    }
                } catch (IOException e) {
                    System.err.printf("summarize :: Moving results failed: %s", e);
                }
                System.out.printf("summarize :: Moved in %d.%03d s.\n", t.stopS(), t.fms());
            }
            tryDelete(mainSortOutputDir);
        }
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    return 0;
}

From source file: fi.tkk.ics.hadoop.bam.cli.plugins.FixMate.java

License: Open Source License

@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("fixmate :: WORKDIR not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("fixmate :: INPATH not given.");
        return 3;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    final SAMFileReader.ValidationStringency stringency = Utils.toStringency(parser.getOptionValue(
            stringencyOpt, SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY.toString()), "fixmate");
    if (stringency == null)
        return 3;

    Path wrkDir = new Path(args.get(0));

    final List<String> strInputs = args.subList(1, args.size());
    final List<Path> inputs = new ArrayList<Path>(strInputs.size());
    for (final String in : strInputs)
        inputs.add(new Path(in));

    final Configuration conf = getConf();

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inputs.get(0) : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    if (stringency != null)
        conf.set(SAMHeaderReader.VALIDATION_STRINGENCY_PROPERTY, stringency.toString());

    final boolean globalSort = parser.getBoolean(sortOpt);
    if (globalSort)
        Utils.setHeaderMergerSortOrder(conf, SAMFileHeader.SortOrder.queryname);

    conf.setStrings(Utils.HEADERMERGER_INPUTS_PROPERTY, strInputs.toArray(new String[0]));

    final Timer t = new Timer();
    try {
        // Required for path ".", for example.
        wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

        if (globalSort)
            Utils.configureSampling(wrkDir, intermediateOutName, conf);

        final Job job = new Job(conf);

        job.setJarByClass(FixMate.class);
        job.setMapperClass(FixMateMapper.class);
        job.setReducerClass(FixMateReducer.class);

        if (!parser.getBoolean(noCombinerOpt))
            job.setCombinerClass(FixMateReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(SAMRecordWritable.class);

        job.setInputFormatClass(AnySAMInputFormat.class);
        job.setOutputFormatClass(CLIMergingAnySAMOutputFormat.class);

        for (final Path in : inputs)
            FileInputFormat.addInputPath(job, in);

        FileOutputFormat.setOutputPath(job, wrkDir);

        if (globalSort) {
            job.setPartitionerClass(TotalOrderPartitioner.class);

            System.out.println("fixmate :: Sampling...");
            t.start();

            InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job,
                    new InputSampler.RandomSampler<LongWritable, SAMRecordWritable>(0.01, 10000,
                            Math.max(100, reduceTasks)));

            System.out.printf("fixmate :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());
        }

        job.submit();

        System.out.println("fixmate :: Waiting for job completion...");
        t.start();

        if (!job.waitForCompletion(verbose)) {
            System.err.println("fixmate :: Job failed.");
            return 4;
        }

        System.out.printf("fixmate :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
        System.err.printf("fixmate :: Hadoop error: %s\n", e);
        return 4;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    if (outPath != null)
        try {
            Utils.mergeSAMInto(outPath, wrkDir, "", "", samFormat, conf, "fixmate");
        } catch (IOException e) {
            System.err.printf("fixmate :: Output merging failed: %s\n", e);
            return 5;
        }
    return 0;
}