Example usage for org.apache.hadoop.conf Configuration setLong

List of usage examples for org.apache.hadoop.conf Configuration setLong

Introduction

On this page you can find example usages of org.apache.hadoop.conf.Configuration.setLong.

Prototype

public void setLong(String name, long value) 

Document

Set the value of the name property to a long.
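
Before the project examples below, here is a minimal, self-contained sketch (not taken from any of the listed projects) of the typical pattern: store a long-valued property with setLong and read it back with getLong, whose second argument is the default returned when the property is unset. The property name used here is only illustrative.

import org.apache.hadoop.conf.Configuration;

public class SetLongExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store a long-valued property, e.g. a 24-hour task timeout in milliseconds.
        conf.setLong("mapred.task.timeout", 86400000L);

        // Read it back; the second argument is the default used if the property is absent.
        long timeout = conf.getLong("mapred.task.timeout", 600000L);
        System.out.println("mapred.task.timeout = " + timeout);
    }
}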

Usage

From source file:org.apache.tez.tests.MiniTezClusterWithTimeline.java

License:Apache License

@Override
public void serviceInit(Configuration conf) throws Exception {
    conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_TEZ_FRAMEWORK_NAME);
    // Use libs from cluster since no build is available
    conf.setBoolean(TezConfiguration.TEZ_USE_CLUSTER_HADOOP_LIBS, true);
    // blacklisting disabled to prevent scheduling issues
    conf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
    if (conf.get(MRJobConfig.MR_AM_STAGING_DIR) == null) {
        conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                new File(getTestWorkDir(), "apps_staging_dir" + Path.SEPARATOR).getAbsolutePath());
    }

    if (conf.get(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC) == null) {
        // nothing defined. set quick delete value
        conf.setLong(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, 0L);
    }

    File appJarLocalFile = new File(MiniTezClusterWithTimeline.APPJAR);

    if (!appJarLocalFile.exists()) {
        String message = "TezAppJar " + MiniTezClusterWithTimeline.APPJAR + " not found. Exiting.";
        LOG.info(message);
        throw new TezUncheckedException(message);
    } else {
        LOG.info("Using Tez AppJar: " + appJarLocalFile.getAbsolutePath());
    }

    FileSystem fs = FileSystem.get(conf);
    Path testRootDir = fs.makeQualified(new Path("target", getName() + "-tmpDir"));
    Path appRemoteJar = new Path(testRootDir, "TezAppJar.jar");
    // Copy AppJar and make it public.
    Path appMasterJar = new Path(MiniTezClusterWithTimeline.APPJAR);
    fs.copyFromLocalFile(appMasterJar, appRemoteJar);
    fs.setPermission(appRemoteJar, new FsPermission("777"));

    conf.set(TezConfiguration.TEZ_LIB_URIS, appRemoteJar.toUri().toString());
    LOG.info("Set TEZ-LIB-URI to: " + conf.get(TezConfiguration.TEZ_LIB_URIS));

    // VMEM monitoring disabled, PMEM monitoring enabled.
    conf.setBoolean(YarnConfiguration.NM_PMEM_CHECK_ENABLED, false);
    conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, false);

    conf.set(CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY, "000");

    try {
        Path stagingPath = FileContext.getFileContext(conf)
                .makeQualified(new Path(conf.get(MRJobConfig.MR_AM_STAGING_DIR)));
        /*
         * Re-configure the staging path on Windows if the file system is localFs.
         * We need to use an absolute path that contains the drive letter. The unit
         * test could run on a different drive than the AM. We can run into the
         * issue that job files are localized to the drive the test runs on,
         * while the AM starts on a different drive and fails to find the job
         * metafiles. Using absolute path can avoid this ambiguity.
         */
        if (Path.WINDOWS) {
            if (LocalFileSystem.class.isInstance(stagingPath.getFileSystem(conf))) {
                conf.set(MRJobConfig.MR_AM_STAGING_DIR,
                        new File(conf.get(MRJobConfig.MR_AM_STAGING_DIR)).getAbsolutePath());
            }
        }
        FileContext fc = FileContext.getFileContext(stagingPath.toUri(), conf);
        if (fc.util().exists(stagingPath)) {
            LOG.info(stagingPath + " exists! deleting...");
            fc.delete(stagingPath, true);
        }
        LOG.info("mkdir: " + stagingPath);
        fc.mkdir(stagingPath, null, true);

        //mkdir done directory as well
        String doneDir = JobHistoryUtils.getConfiguredHistoryServerDoneDirPrefix(conf);
        Path doneDirPath = fc.makeQualified(new Path(doneDir));
        fc.mkdir(doneDirPath, null, true);
    } catch (IOException e) {
        throw new TezUncheckedException("Could not create staging directory. ", e);
    }
    conf.set(MRConfig.MASTER_ADDRESS, "test");

    //configure the shuffle service in NM
    conf.setStrings(YarnConfiguration.NM_AUX_SERVICES,
            new String[] { ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID });
    conf.setClass(
            String.format(YarnConfiguration.NM_AUX_SERVICE_FMT, ShuffleHandler.MAPREDUCE_SHUFFLE_SERVICEID),
            ShuffleHandler.class, Service.class);

    // Non-standard shuffle port
    conf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, 0);

    conf.setClass(YarnConfiguration.NM_CONTAINER_EXECUTOR, DefaultContainerExecutor.class,
            ContainerExecutor.class);

    // TestMRJobs is for testing non-uberized operation only; see TestUberAM
    // for corresponding uberized tests.
    conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
    super.serviceInit(conf);
}

From source file:org.broadinstitute.sting.gatk.hadoop.GATKJobClient.java

License:Open Source License

@Override
public int run(String[] argv) throws Exception {
    try {
        Configuration conf;
        FileSystem srcFs, outFs, fs;
        Path inputPath = null, mergeOutFile, inputDir, partition = null, outputPath;
        int maxMapTasks, maxReduceTasks, max_splits = Integer.MAX_VALUE, granularity = 100;
        FileStatus[] content;
        ClusterStatus status;
        int numNodes, mapSlotsPerNode;
        long mapOutputBytes, iMBytesPerRed, mapOutBufSize, inputSize, cacheSize, startTime, blockSize, endTime,
                splitSize;
        float inputBufpcnt;
        FSDataOutputStream out;
        FSDataInputStream in;
        SAMFileReader fileReader;
        InputSampler.Sampler<LongWritable, SAMRecordWritable> sampler;
        double sampling_frequency = 0.01;

        // Job object can be used for Aligner job if enabled
        conf = getConf();
        Job job = new Job(conf);

        parseCommandLineArgs(argv, conf);

        maxMapTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxMapTasks();

        maxReduceTasks = new JobClient(new JobConf(conf)).getClusterStatus().getMaxReduceTasks();
        if (!noalign) {
            System.out.println("Starting Alignment Job");
            startTime = System.currentTimeMillis();

            status = new JobClient(new JobConf(conf)).getClusterStatus();
            numNodes = status.getTaskTrackers();
            // Job specific setting of number of Reducers..
            if (nReducers == 0)
                nReducers = numNodes;
            conf.setInt("mapred.reduce.tasks", nReducers);

            Path refPath = new Path(refFileLoc);
            fs = refPath.getFileSystem(conf);
            blockSize = fs.getFileStatus(refPath).getBlockSize();
            splitSize = Math.round(fs.getFileStatus(refPath).getBlockSize());

            if (reads_per_split == 0) {
                inputPath = new Path(readFile1);
                long readSize = (inputPath.getFileSystem(conf)).getFileStatus(inputPath).getLen();
                long numSplits = Math.round(readSize / splitSize);

                if (numSplits < maxMapTasks)
                    numSplits = maxMapTasks;

                if (numSplits < nReducers)
                    numSplits = nReducers;

                long numReads = Math.round(readSize / (long) fq_read_size);
                reads_per_split = numReads / numSplits;

                // Total Order Partitioner
                if ((double) reads_per_split <= (1 / sampling_frequency)) {
                    sampling_frequency = 1;
                    granularity = 1;
                } else if (((double) reads_per_split > (1 / sampling_frequency))
                        && ((double) reads_per_split <= (1 / sampling_frequency * 100))) {
                    sampling_frequency = 0.1;
                    granularity = 10;
                }
            }

            job.setJarByClass(GATKJobClient.class);
            job.setInputFormatClass(NLineXInputFormat.class);
            FileInputFormat.addInputPath(job, new Path(fqInput));
            FileOutputFormat.setOutputPath(job, new Path(BWAOutPath));

            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".amb#" + "ref.fa.amb"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".ann#" + "ref.fa.ann"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".bwt#" + "ref.fa.bwt"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".pac#" + "ref.fa.pac"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".sa#" + "ref.fa.sa"), job.getConfiguration());
            if (!is_azure) {
                DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa"), job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rbwt#" + "ref.fa.rbwt"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rpac#" + "ref.fa.rpac"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".rsa#" + "ref.fa.rsa"),
                        job.getConfiguration());
            } else {
                DistributedCache.addCacheFile(new URI(bwa_binary_loc + "#" + "bwa.exe"),
                        job.getConfiguration());
            }
            DistributedCache.createSymlink(job.getConfiguration());

            // Setting local.cache.size - Add up the size of the files
            // distributed through the cache

            cacheSize = fs.getFileStatus(new Path(refFileLoc)).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".amb")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".ann")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".bwt")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".pac")).getLen()
                    + fs.getFileStatus(new Path(refFileLoc + ".sa")).getLen();
            if (!is_azure) {
                cacheSize = cacheSize + fs.getFileStatus(new Path(refFileLoc + ".rbwt")).getLen()
                        + fs.getFileStatus(new Path(refFileLoc + ".rpac")).getLen()
                        + fs.getFileStatus(new Path(refFileLoc + ".rsa")).getLen();
            }

            // Use long literals so the 8 GB threshold and 1 GB headroom are not truncated by int overflow.
            if (cacheSize > 8L * 1024 * 1024 * 1024) {
                conf.setLong("local.cache.size", cacheSize + (1L * 1024 * 1024 * 1024));
            }

            conf.setLong("mapred.task.timeout", 86400000L); // 24 hrs..
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setLong("mapred.line.input.format.linespermap", reads_per_split * 4);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            // conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setPartitionerClass(BWAPartitioner.class);
            job.setReducerClass(BWAReducer.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            if (job.waitForCompletion(true)) {
                System.out.println("BWA Alignment done");
            }

            content = fs.listStatus(new Path(BWAOutPath));

            for (int i = 0; i < content.length; i++) {
                if (!((content[i].getPath().getName()).endsWith(".bam"))
                        && !((content[i].getPath().getName()).startsWith("_"))) {
                    fs.delete(content[i].getPath(), false);
                }
            }
            endTime = System.currentTimeMillis();
            System.out.println("BWA Alignment took: " + (endTime - startTime));
            startTime = System.currentTimeMillis();
            System.out.println("Starting Splitting BAM Indexing Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(BWAOutPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir1");
            FileOutputFormat.setOutputPath(job, output);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            conf.setInt("gatk.hadoop.granularity", granularity);
            conf.setBoolean("gatk.hadoop.issindex", true);
            conf.setBoolean("gatk.hadoop.isindex", false);
            conf.setBoolean("gatk.hadoop.ismarkdup", false);

            job.setMapperClass(IndexMapper.class);
            job.setMapOutputKeyClass(NullWritable.class);
            job.setMapOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("SplittingBAM Indexing job done");
            }
            output.getFileSystem(conf).delete(output, true);

            endTime = System.currentTimeMillis();
            System.out.println("Splitting BAM Indexing took: " + (endTime - startTime));

            startTime = System.currentTimeMillis();
            System.out.println("Starting Sort Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();

            if (norealign && nomarkdup && noqrecab && novariant && !nomresults)
                conf.setBoolean("gatk.hadoop.ismerge", true);
            inputPath = new Path(BWAOutPath);
            FileInputFormat.addInputPath(job, inputPath);
            FileOutputFormat.setOutputPath(job, new Path(SortBWAOutPath));
            job.setInputFormatClass(ContigInputFormat.class);
            job.setPartitionerClass(ContigPartitioner.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());

            fs = inputPath.getFileSystem(conf);
            content = fs.listStatus(inputPath);
            for (int i = 0; i < content.length; i++) {
                if (content[i].getPath().getName().endsWith(".bam")) {
                    in = fs.open(content[i].getPath());
                    List<SAMSequenceRecord> sequences = (new SAMFileReader(in).getFileHeader())
                            .getSequenceDictionary().getSequences();
                    conf.setInt("mapred.reduce.tasks", sequences.size());

                    break;
                }
            }

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            //conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            //conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            //conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            job.setReducerClass(SortReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(SAMRecordWritable.class);
            job.setOutputFormatClass(SortOutputFormat.class);

            if (job.waitForCompletion(true)) {
                System.out.println("Sort completed successfully");
            }
            endTime = System.currentTimeMillis();
            System.out.println("Sort job took: " + (endTime - startTime));
        }

        if (!norealign) {
            if (!noalign)
                BAMInputPath = SortBWAOutPath;

            startTime = System.currentTimeMillis();
            System.out.println("Starting Indexing Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);
            Path output = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir2");
            FileOutputFormat.setOutputPath(job, output);

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            conf.setBoolean("gatk.hadoop.isindex", true);
            conf.setBoolean("gatk.hadoop.issindex", true);
            conf.setBoolean("gatk.hadoop.ismarkdup", false);

            job.setMapperClass(IndexMapper.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            if (job.waitForCompletion(true)) {
                System.out.println("Indexing job done");
            }
            output.getFileSystem(conf).delete(output, true);

            endTime = System.currentTimeMillis();
            System.out.println("Indexing job took: " + (endTime - startTime));

            startTime = System.currentTimeMillis();
            System.out.println("Starting Realigner Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();

            inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);

            job.setInputFormatClass(BAMInputFormat.class);

            srcFs = new Path(outputDir).getFileSystem(conf);
            if (!srcFs.mkdirs(new Path(outputDir + Path.SEPARATOR + "Partition")))
                System.out.println("mkdir failed");
            inputDir = new Path(outputDir + Path.SEPARATOR + "Partition");
            inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
            partition = new Path(inputDir, "_partition");
            job.setPartitionerClass(TotalOrderPartitioner.class);
            TotalOrderPartitioner.setPartitionFile(conf, partition);

            try {
                URI partitionURI = new URI(partition.toString() + "#_partition");
                DistributedCache.addCacheFile(partitionURI, conf);
            } catch (URISyntaxException e) {
                assert false;
            }

            if (nReducers == 0) {
                if (!nomarkdup || !noqrecab || !novariant) {
                    conf.setInt("mapred.reduce.tasks", maxMapTasks);
                } else {
                    conf.setInt("mapred.reduce.tasks", Math.max(1, maxReduceTasks * 9 / 10));
                }
            } else {
                conf.setInt("mapred.reduce.tasks", nReducers);
            }

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);
            conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            if (nomarkdup && noqrecab && novariant && !nomresults)
                conf.setBoolean("gatk.hadoop.ismerge", true);
            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setMapperClass(IndelMapper.class);
            job.setReducerClass(SortReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(SAMRecordWritable.class);
            job.setOutputFormatClass(SortOutputFormat.class);
            FileOutputFormat.setOutputPath(job, new Path(IndelOutPath));

            sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency,
                    max_splits);
            InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler);
            job.setInputFormatClass(LociInputFormat.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());
            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("Indel realignment done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("Indel Realigner took: " + (endTime - startTime));
        }

        if (!nomarkdup || !noqrecab || !novariant) {
            /* 
             * MarkDuplicate and Indexing Job 
             * FixMateInformation is not required as it is handled
             * automatically by GATK after IndelRealignment.
             */
            System.out.println("Starting MarkDup/Indexing job");
            startTime = System.currentTimeMillis();
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            conf.setInt("mapred.reduce.tasks", 0);
            if (!nomarkdup) {
                System.out.println("Starting MarkDuplicates job");
                conf.setBoolean("gatk.hadoop.ismarkdup", true);
                FileOutputFormat.setOutputPath(job, new Path(RmdupOutPath));
            }
            if (!noqrecab || !novariant) {
                conf.setBoolean("gatk.hadoop.issindex", true);
                conf.setBoolean("gatk.hadoop.isindex", true);
                if (nomarkdup) {
                    System.out.println("Starting Indexing job");
                    FileOutputFormat.setOutputPath(job,
                            new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3"));
                }
            }
            job.setMapperClass(IndexMapper.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            if (job.waitForCompletion(true)) {
                System.out.println("Markdup/Indexing job done !!!");
            }
            Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir3");
            fs = toDelete.getFileSystem(conf);
            if (fs.exists(toDelete)) {
                fs.delete(toDelete, true);
            }

            if (!nomarkdup) {
                Path rmdupOutPath = new Path(RmdupOutPath);
                fs = rmdupOutPath.getFileSystem(conf);
                content = fs.listStatus(rmdupOutPath);

                for (int i = 0; i < content.length; i++) {
                    if ((content[i].getPath().getName()).startsWith("part")) {
                        fs.delete(content[i].getPath(), false);
                    }
                }
                endTime = System.currentTimeMillis();
                System.out.println("MarkDuplicates took: " + (endTime - startTime));
            } else {
                endTime = System.currentTimeMillis();
                System.out.println("Indexing took: " + (endTime - startTime));
            }
        }

        if (!noqrecab) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Recal - Count Covariates Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            if (!nomarkdup)
                inputPath = new Path(RmdupOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(LociInputFormat.class);

            conf.setLong("local.cache.size", 20106127360L);
            conf.setInt("mapred.reduce.tasks", 1);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.set("gatk.hadoop.outputpath", outputDir);
            // conf.setInt("mapred.tasktracker.map.tasks.maximum", 1);
            // conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 1);
            // conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
            // conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1

            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setMapperClass(RecalCovMapper.class);
            job.setCombinerClass(RecalCovCombiner.class);
            job.setReducerClass(RecalCovReducer.class);
            job.setMapOutputKeyClass(Text.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
            FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "CovariateOut"));

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(knownSitesLoc + "#" + "ref.vcf"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(knownSitesLoc + ".idx#" + "ref.vcf.idx"),
                    job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("CountCovariates done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("CountCovariates took: " + (endTime - startTime));
        }

        if (!noqrecab || !novariant) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Table Recalibration / Unified Genotyper Job");
            if (!nomarkdup)
                inputPath = new Path(RmdupOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else
                inputPath = new Path(BAMInputPath);
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            FileInputFormat.addInputPath(job, inputPath);

            if (!noqrecab) {
                conf.setBoolean("gatk.hadoop.recab", true);
                if (norealign) {
                    job.setInputFormatClass(BAMInputFormat.class);
                    srcFs = new Path(outputDir).getFileSystem(conf);
                    if (!srcFs.mkdirs(new Path(outputDir + "/" + "Partition")))
                        System.out.println("mkdir failed");
                } else {
                    job.setInputFormatClass(LociInputFormat.class);
                }
                inputDir = new Path(outputDir + "/" + "Partition");
                inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
                partition = new Path(inputDir, "_partition");
                job.setPartitionerClass(TotalOrderPartitioner.class);
                TotalOrderPartitioner.setPartitionFile(conf, partition);
                try {
                    URI partitionURI = new URI(partition.toString() + "#_partition");
                    DistributedCache.addCacheFile(partitionURI, conf);
                } catch (URISyntaxException e) {
                    assert false;
                }

                if (nReducers == 0) {
                    conf.setInt("mapred.reduce.tasks", maxMapTasks);
                } else {
                    conf.setInt("mapred.reduce.tasks", nReducers);
                }
                conf.setBoolean("mapred.compress.map.output", true); // Default compression ratio 3.5:1
                conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
                if (!nomresults)
                    conf.setBoolean("gatk.hadoop.ismerge", true);
                job.setReducerClass(SortReducer.class);
                job.setMapOutputKeyClass(LongWritable.class);
                job.setOutputKeyClass(NullWritable.class);
                job.setOutputValueClass(SAMRecordWritable.class);
                job.setOutputFormatClass(SortOutputFormat.class);
                FileOutputFormat.setOutputPath(job, new Path(RecalOutPath));
            } else {
                job.setInputFormatClass(LociInputFormat.class);
                conf.setInt("mapred.reduce.tasks", 0);
                FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4"));
            }

            job.setMapperClass(RecalMapper.class);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setInt("dfs.datanode.socket.write.timeout", 600000);
            conf.setInt("dfs.socket.timeout", 600000);

            conf.set("gatk.hadoop.outputpath", outputDir);
            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            if (!novariant) {
                conf.setBoolean("gatk.hadoop.variant", true);
                if (!nofvariant)
                    conf.setBoolean("gatk.hadoop.fvariant", true);
                conf.setInt("gatk.hadoop.nthreads", nThreads);
                conf.setBoolean("gatk.hadoop.xvariant", xVariantCall);
            }

            if (!noqrecab && norealign) {
                sampler = new InputSampler.IntervalSampler<LongWritable, SAMRecordWritable>(sampling_frequency,
                        max_splits);
                InputSampler.<LongWritable, SAMRecordWritable>writePartitionFile(job, sampler);
                job.setInputFormatClass(LociInputFormat.class);
            }

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("TableRecalibration Job done !!");
            }
            endTime = System.currentTimeMillis();
            Path toDelete = new Path(outputDir + Path.SEPARATOR + "DeleteThisDir4");
            fs = toDelete.getFileSystem(conf);
            if (fs.exists(toDelete)) {
                fs.delete(toDelete, true);
            }
            System.out.println("TableRecalibraion / UnifiedGenotyper job took: " + (endTime - startTime));
        }
        if (!novariant && !nomresults) {
            startTime = System.currentTimeMillis();
            System.out.println("Merge Variant Job");
            job = new Job();
            job.setJarByClass(GATKJobClient.class);
            conf = job.getConfiguration();
            inputPath = new Path(outputDir + Path.SEPARATOR + "VariantOut");
            FileInputFormat.addInputPath(job, inputPath);
            job.setInputFormatClass(WholeFileInputFormat.class);

            conf.setInt("mapred.reduce.tasks", 1);
            conf.setLong("mapred.task.timeout", 86400000L);
            conf.setBoolean("mapred.map.tasks.speculative.execution", false);
            conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

            conf.setBoolean("gatk.hadoop", true);
            conf.setBoolean("gatk.hadoop.isazure", is_azure);
            job.setReducerClass(VariantReducer.class);
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(NullWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalVariantOut"));

            DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
            // Standard inputs
            DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"), job.getConfiguration());
            DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"), job.getConfiguration());

            DistributedCache.createSymlink(job.getConfiguration());

            if (job.waitForCompletion(true)) {
                System.out.println("Merge Variants done");
            }
            endTime = System.currentTimeMillis();
            System.out.println("MergeVariant job took: " + (endTime - startTime));

            if (xVariantCall && !novariant && !nomresults) {
                startTime = System.currentTimeMillis();

                System.out.println("Merge INDEL Variant Job");
                job = new Job();
                job.setJarByClass(GATKJobClient.class);
                conf = job.getConfiguration();
                inputPath = new Path(outputDir + Path.SEPARATOR + "IVariantOut");
                FileInputFormat.addInputPath(job, inputPath);
                job.setInputFormatClass(WholeFileInputFormat.class);

                conf.setInt("mapred.reduce.tasks", 1);
                conf.setLong("mapred.task.timeout", 86400000L);
                conf.setBoolean("mapred.map.tasks.speculative.execution", false);
                conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

                conf.setBoolean("gatk.hadoop", true);
                conf.setBoolean("gatk.hadoop.isazure", is_azure);
                job.setReducerClass(VariantReducer.class);
                job.setMapOutputKeyClass(LongWritable.class);
                job.setMapOutputValueClass(Text.class);
                job.setOutputKeyClass(NullWritable.class);
                job.setOutputValueClass(NullWritable.class);
                FileOutputFormat.setOutputPath(job, new Path(outputDir + Path.SEPARATOR + "FinalIVariantOut"));

                DistributedCache.addArchiveToClassPath(new Path(gatk_binary_loc), job.getConfiguration());
                // Standard inputs
                DistributedCache.addCacheFile(new URI(refFileLoc + "#" + "ref.fa"), job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileLoc + ".fai#" + "ref.fa.fai"),
                        job.getConfiguration());
                DistributedCache.addCacheFile(new URI(refFileName + ".dict#" + "ref.dict"),
                        job.getConfiguration());

                DistributedCache.createSymlink(job.getConfiguration());

                if (job.waitForCompletion(true)) {
                    System.out.println("Merge INDEL Variants done");
                }
                endTime = System.currentTimeMillis();
                System.out.println("MergeINDELVariant job took: " + (endTime - startTime));
            }
        }

        if (!nomresults) {
            startTime = System.currentTimeMillis();
            System.out.println("Starting Merge BAM Job");

            outputPath = new Path(FinalBAMPath);
            outFs = outputPath.getFileSystem(conf);

            if (!outFs.mkdirs(outputPath))
                System.out.println("mkdir failed");
            // Currently there is no support for merging MarkDuplicates output
            // from the Job Client; a separate MR job is needed for that.
            if (!noqrecab)
                inputPath = new Path(RecalOutPath);
            else if (!norealign)
                inputPath = new Path(IndelOutPath);
            else if (!noalign)
                inputPath = new Path(SortBWAOutPath);
            else if (!nomarkdup)
                throw new Exception("Merge not implemented for MarkDuplicates output.");
            else if (noqrecab && noalign && norealign && novariant && nomarkdup && nofvariant)
                inputPath = new Path(BAMInputPath);

            fs = inputPath.getFileSystem(conf);

            content = fs.listStatus(inputPath);
            mergeOutFile = new Path(FinalBAMPath, "GATKAnalysisResult.bam");

            Path p = null;
            int nfiles = 0;
            for (int i = 0; i < content.length; i++) {
                p = content[i].getPath();
                ++nfiles;
            }

            if (nfiles == 1) {
                boolean rename = fs.rename(p, mergeOutFile);
            } else {
                out = outFs.create(mergeOutFile, true);

                for (int i = 0; i < content.length; i++) {
                    p = content[i].getPath();
                    if ((p.getName()).endsWith(".bam")) {
                        in = fs.open(p);
                        IOUtils.copyBytes(in, out, conf, false);
                        in.close();
                    }
                }

                out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
                out.close();
            }

            endTime = System.currentTimeMillis();
            System.out.println("Final Merge took: " + (endTime - startTime));
        }
        System.out.println("JobCompleted");
    } catch (IOException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (InterruptedException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (ClassNotFoundException e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    } catch (Exception e) {
        System.err.printf("Hadoop Error : %s\n", e);
        return -1;
    }
    return 0;
}

From source file:org.lilyproject.hadooptestfw.HBaseTestingUtilityFactory.java

License:Apache License

/**
 * Creates an HBaseTestingUtility with settings applied such that everything will be stored below the
 * supplied directory and makes (to some extent) use of standard port numbers.
 *
 * @param conf HBase conf to use, as created by HBaseConfiguration.create().
 * @param tmpDir directory under which data of dfs, zookeeper, mr, ... will be stored
 * @param clearData can data be cleared (at startup or shutdown), use true unless you need the data from a previous
 *                  run
 */
public static HBaseTestingUtility create(Configuration conf, File tmpDir, boolean clearData)
        throws IOException {

    // This location will be used for dfs, zookeeper, ...
    conf.set(TEST_DIR_KEY, createSubDir(tmpDir, "hbase-test-util"));

    // This property is picked up by our fork of MiniMRCluster (the default implementation was hardcoded
    // to use build/test/mapred/local)
    System.setProperty("mapred.local.dir", createSubDir(tmpDir, "mapred-local"));

    conf.set("mapred.local.dir", createSubDir(tmpDir, "mapred-local"));

    // Properties used for MiniMRCluster
    conf.set("hadoop.log.dir", createSubDir(tmpDir, "hadoop-logs"));
    conf.set("hadoop.tmp.dir", createSubDir(tmpDir, "mapred-output"));

    conf.set("mapred.system.dir", "/tmp/hadoop/mapred/system");
    conf.set("mapreduce.jobtracker.staging.root.dir", "/tmp/hadoop/mapred/staging");

    // Only use one MR child VM, should be lighter on developer machines
    conf.set("mapred.tasktracker.map.tasks.maximum", "1");

    // Force default port numbers
    conf.set("hbase.master.info.port", "60010");
    conf.set("hbase.regionserver.info.port", "60030");

    // Allow more clients to connect concurrently (HBase default is 10)
    conf.set("hbase.regionserver.handler.count", "30");

    // Allow more clients to connect concurrently to hdfs (default is 3)
    conf.set("dfs.datanode.handler.count", "6");

    // Generic performance related settings
    conf.set("io.file.buffer.size", "65536");
    conf.set("hbase.hregion.memstore.flush.size", "268435456");

    // Disable the automatic closing of Hadoop FileSystem objects by its shutdown hook.
    // Otherwise, when stopping 'launch-test-lily' (LilyLauncher), the shutdown hook closes the filesystem
    // before HBase had the opportunity to flush its data. This then leads to (possibly long) recoveries
    // on the next startup (and even then, I've seen data loss, maybe sync is not active for the mini cluster?).
    conf.set("fs.automatic.close", "false");

    // Replication parameters needed for the SEP
    conf.set("hbase.replication", "true");
    conf.setFloat("replication.source.ratio", 1.0f);
    conf.set("replication.source.nb.capacity", "200");
    conf.set("replication.replicationsource.implementation", "com.ngdata.sep.impl.SepReplicationSource");

    // make replication react a little quicker
    conf.setLong("replication.source.sleepforretries", 200);

    // make retries in ZooKeeper a little quicker
    // This was added with CDH 4.2, where on shutdown HBase's snapshot manager closed a zookeeper
    // connection which later on was still used by another component, which then got into a retry loop,
    // leading to a slow shutdown.
    conf.setInt("zookeeper.recovery.retry.intervalmill", 100);

    return new HBaseTestingUtility(conf, clearData);
}

From source file:org.mrgeo.data.vector.VectorInputFormatContext.java

License:Apache License

public void save(final Configuration conf) {
    conf.setInt(INPUTS_COUNT, inputs.size());
    int inputIndex = 0;
    for (String input : inputs) {
        conf.set(INPUTS_PREFIX + inputIndex, input);
        inputIndex++;
    }
    conf.setLong(FEATURE_COUNT_KEY, featureCount);
    conf.setInt(MIN_FEATURES_PER_SPLIT_KEY, minFeaturesPerSplit);
    conf.set(PROVIDER_PROPERTY_KEY, ProviderProperties.toDelimitedString(inputProviderProperties));
}

From source file:org.mrgeo.vector.mrsvector.OSMTileIngester.java

License:Apache License

private boolean buildTiles() {
    try {
        final Job job = new Job(config);
        HadoopUtils.setJar(job, this.getClass());

        final String now = new SimpleDateFormat("yyyy-MM-dd'T'HH-mm-ss").format(new Date());

        final String jobName = "BuildTiles_" + now + "_" + UUID.randomUUID().toString();
        job.setJobName(jobName);

        final Configuration conf = job.getConfiguration();

        conf.setInt(ZOOMLEVEL, zoomlevel);
        final int tilesize = Integer.parseInt(MrGeoProperties.getInstance()
                .getProperty(MrGeoConstants.MRGEO_MRS_TILESIZE, MrGeoConstants.MRGEO_MRS_TILESIZE_DEFAULT));
        conf.setInt(TILESIZE, tilesize);
        conf.set(OUTPUT, tmpDir.toString());

        conf.setInt(GRANULATIRY, granularity);
        conf.setLong(LATOFFSET, latOffset);
        conf.setLong(LONOFFSET, lonOffset);

        job.setInputFormatClass(SequenceFileInputFormat.class);

        final Path tilesPath = new Path(tmpDir, TILEIDS + "/*/part*");
        HadoopVectorUtils.addInputPath(job, tilesPath);

        job.setReducerClass(ProcessTilesReducer.class);

        final Path output = new Path(tmpDir, VECTORTILES);

        HadoopFileUtils.delete(output);

        MrsImageOutputFormatProvider ofProvider = MrsImageDataProvider.setupMrsPyramidOutputFormat(job,
                output.toString(), datasetBounds, zoomlevel, tilesize, protectionLevel, providerProperties);
        //FileOutputFormat.setOutputPath(job, outputWithZoom);

        job.setMapOutputKeyClass(TileIdWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(TileIdWritable.class);
        job.setOutputValueClass(VectorTileWritable.class);

        try {
            job.submit();
            final boolean success = job.waitForCompletion(true);

            if (success) {
                ofProvider.teardown(job);
                MrsVectorPyramid.calculateMetadata(output.toString(), zoomlevel, tilesize, datasetBounds,
                        protectionLevel);
                return true;
            }

        } catch (final InterruptedException e) {
            e.printStackTrace();
        } catch (final ClassNotFoundException e) {
            e.printStackTrace();
        }
    } catch (final IOException e) {
        e.printStackTrace();
    }
    return false;

}

From source file:org.mrgeo.vector.mrsvector.OSMTileIngester.java

License:Apache License

private boolean processNodes() {
    try {
        final Job job = new Job(config);
        HadoopUtils.setJar(job, this.getClass());

        final String now = new SimpleDateFormat("yyyy-MM-dd'T'HH-mm-ss").format(new Date());

        final String jobName = "ProcesNodes_" + now + "_" + UUID.randomUUID().toString();
        job.setJobName(jobName);

        final Configuration conf = job.getConfiguration();

        conf.setInt(ZOOMLEVEL, zoomlevel);
        conf.setInt(TILESIZE, Integer.parseInt(MrGeoProperties.getInstance()
                .getProperty(MrGeoConstants.MRGEO_MRS_TILESIZE, MrGeoConstants.MRGEO_MRS_TILESIZE_DEFAULT)));
        conf.set(OUTPUT, tmpDir.toString());

        conf.setInt(GRANULATIRY, granularity);
        conf.setLong(LATOFFSET, latOffset);
        conf.setLong(LONOFFSET, lonOffset);

        job.setInputFormatClass(SequenceFileInputFormat.class);

        final Path nodesPath = new Path(tmpDir, NODES);
        HadoopVectorUtils.addInputPath(job, nodesPath);

        job.setReducerClass(ProcessNodesReducer.class);

        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        final Path output = new Path(tmpDir, TILEIDS + "/" + NODES);
        HadoopFileUtils.delete(output);
        FileOutputFormat.setOutputPath(job, output);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(TileIdWritable.class);
        job.setOutputValueClass(Text.class);

        try {
            job.submit();
            final boolean success = job.waitForCompletion(true);

            if (success) {
                return true;
            }

        } catch (final InterruptedException e) {
            e.printStackTrace();
        } catch (final ClassNotFoundException e) {
            e.printStackTrace();
        }
    } catch (final IOException e) {
        e.printStackTrace();
    }
    return false;
}

From source file:org.mrgeo.vector.mrsvector.OSMTileIngester.java

License:Apache License

private boolean processRelations() {
    try {
        int runCnt = 1;

        while (true) {
            final Job job = new Job(config);
            HadoopUtils.setJar(job, this.getClass());

            final String now = new SimpleDateFormat("yyyy-MM-dd'T'HH-mm-ss").format(new Date());

            final String jobName = "ProcesRelations_" + runCnt + "_" + now + "_" + UUID.randomUUID().toString();
            job.setJobName(jobName);

            final Configuration conf = job.getConfiguration();

            conf.setInt(ZOOMLEVEL, zoomlevel);
            conf.setInt(TILESIZE, Integer.parseInt(MrGeoProperties.getInstance().getProperty(
                    MrGeoConstants.MRGEO_MRS_TILESIZE, MrGeoConstants.MRGEO_MRS_TILESIZE_DEFAULT)));
            conf.set(OUTPUT, tmpDir.toString());

            conf.setInt(GRANULATIRY, granularity);
            conf.setLong(LATOFFSET, latOffset);
            conf.setLong(LONOFFSET, lonOffset);
            conf.setInt(RELATION_RUN, runCnt);

            job.setInputFormatClass(SequenceFileInputFormat.class);

            final Path relationsPath;
            if (runCnt <= 1) {
                relationsPath = new Path(tmpDir, RELATIONS);
            } else {
                relationsPath = new Path(tmpDir, RELATIONS + "_" + (runCnt - 1));
            }
            HadoopVectorUtils.addInputPath(job, relationsPath);

            job.setOutputFormatClass(SequenceFileOutputFormat.class);

            final Path output = new Path(tmpDir, TILEIDS + "/" + RELATIONS + "_" + runCnt);
            HadoopFileUtils.delete(output);
            FileOutputFormat.setOutputPath(job, output);

            job.setReducerClass(ProcessRelationsReducer.class);

            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);

            job.setOutputKeyClass(TileIdWritable.class);
            job.setOutputValueClass(Text.class);

            boolean success = false;
            try {
                job.submit();
                success = job.waitForCompletion(true);
            } catch (final InterruptedException e) {
                e.printStackTrace();
            } catch (final ClassNotFoundException e) {
                e.printStackTrace();
            }

            if (success) {

                final Path rp = new Path(tmpDir, RELATIONS + "_" + runCnt);

                // did we make a relations file?
                if (!HadoopFileUtils.exists(rp)) {
                    return true;
                }
            }
            runCnt++;

            if (runCnt > 5) {
                return true;
            }
        }
    } catch (final IOException e) {
        e.printStackTrace();
    }
    return false;
}

From source file:org.mrgeo.vector.mrsvector.OSMTileIngester.java

License:Apache License

private boolean processWays() {
    try {
        final Job job = new Job(config);
        HadoopUtils.setJar(job, this.getClass());

        final String now = new SimpleDateFormat("yyyy-MM-dd'T'HH-mm-ss").format(new Date());

        final String jobName = "ProcesWays_" + now + "_" + UUID.randomUUID().toString();
        job.setJobName(jobName);

        final Configuration conf = job.getConfiguration();

        conf.setInt(ZOOMLEVEL, zoomlevel);
        conf.setInt(TILESIZE, Integer.parseInt(MrGeoProperties.getInstance()
                .getProperty(MrGeoConstants.MRGEO_MRS_TILESIZE, MrGeoConstants.MRGEO_MRS_TILESIZE_DEFAULT)));
        conf.set(OUTPUT, tmpDir.toString());

        conf.setInt(GRANULATIRY, granularity);
        conf.setLong(LATOFFSET, latOffset);
        conf.setLong(LONOFFSET, lonOffset);

        job.setInputFormatClass(SequenceFileInputFormat.class);

        final Path waysPath = new Path(tmpDir, WAYS);
        HadoopVectorUtils.addInputPath(job, waysPath);

        job.setReducerClass(ProcessWaysReducer.class);

        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        final Path output = new Path(tmpDir, TILEIDS + "/" + WAYS);
        HadoopFileUtils.delete(output);
        FileOutputFormat.setOutputPath(job, output);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(TileIdWritable.class);
        job.setOutputValueClass(Text.class);

        try {
            job.submit();
            final boolean success = job.waitForCompletion(true);

            if (success) {
                return true;
            }
        } catch (final InterruptedException e) {
            e.printStackTrace();
        } catch (final ClassNotFoundException e) {
            e.printStackTrace();
        }
    } catch (final IOException e) {
        e.printStackTrace();
    }
    return false;
}

From source file:org.smartfrog.services.hadoop.mapreduce.terasort.TeraGenJob.java

License:Apache License

static void setNumberOfRows(Configuration job, long numRows) {
    job.setLong("terasort.num-rows", numRows);
}

From source file:org.unigram.likelike.lsh.function.TestMinWiseFunction.java

License:Apache License

private MinWiseFunction createFunction(int depth, long seed) {
    Configuration conf = new Configuration();
    conf.setLong(SelectClustersMapper.MINWISE_HASH_SEEDS, seed);
    conf.setInt(LikelikeConstants.FEATURE_DEPTH, depth);
    return new MinWiseFunction(conf);
}