Example usage for org.apache.hadoop.conf Configuration setBoolean

List of usage examples for org.apache.hadoop.conf Configuration setBoolean

Introduction

On this page you can find example usage for org.apache.hadoop.conf.Configuration.setBoolean.

Prototype

public void setBoolean(String name, boolean value) 

Document

Set the value of the name property to a boolean.
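
Before the examples, here is a minimal self-contained sketch (not taken from any of the projects below; the property names are hypothetical) showing the typical round trip: a driver sets a flag with setBoolean, and it is later read back with getBoolean, whose second argument is the default returned when the property is unset.

import org.apache.hadoop.conf.Configuration;

public class SetBooleanExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store the flag; Configuration keeps it internally as the string "true"/"false".
        conf.setBoolean("example.feature.enabled", true);

        // Read it back; the second argument is the default returned when the
        // property has never been set.
        boolean enabled = conf.getBoolean("example.feature.enabled", false);
        boolean missing = conf.getBoolean("example.other.flag", false);

        System.out.println("enabled = " + enabled); // true
        System.out.println("missing = " + missing); // false (the default)
    }
}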

Usage

From source file: edu.umn.cs.spatialHadoop.operations.Indexer.java

License: Open Source License

/**
 * Create a partitioner for a particular job.
 * @param ins the input paths whose contents are sampled
 * @param out the output path, used to determine the block size
 * @param job the job configuration
 * @param partitionerName the name of the partitioner to create
 * @return the created partitioner, or null if it could not be instantiated
 * @throws IOException
 */
public static Partitioner createPartitioner(Path[] ins, Path out, Configuration job, String partitionerName)
        throws IOException {
    try {
        Partitioner partitioner = null;
        Class<? extends Partitioner> partitionerClass = PartitionerClasses.get(partitionerName.toLowerCase());
        if (partitionerClass == null) {
            // Try to parse the name as a class name
            try {
                partitionerClass = Class.forName(partitionerName).asSubclass(Partitioner.class);
            } catch (ClassNotFoundException e) {
                throw new RuntimeException("Unknown index type '" + partitionerName + "'");
            }
        }

        if (PartitionerReplicate.containsKey(partitionerName.toLowerCase())) {
            boolean replicate = PartitionerReplicate.get(partitionerName.toLowerCase());
            job.setBoolean("replicate", replicate);
        }
        partitioner = partitionerClass.newInstance();

        long t1 = System.currentTimeMillis();
        final Rectangle inMBR = (Rectangle) OperationsParams.getShape(job, "mbr");
        // Determine number of partitions
        long inSize = 0;
        for (Path in : ins) {
            inSize += FileUtil.getPathSize(in.getFileSystem(job), in);
        }
        long estimatedOutSize = (long) (inSize * (1.0 + job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.1f)));
        FileSystem outFS = out.getFileSystem(job);
        long outBlockSize = outFS.getDefaultBlockSize(out);
        int numPartitions = Math.max(1, (int) Math.ceil((float) estimatedOutSize / outBlockSize));
        LOG.info("Partitioning the space into " + numPartitions + " partitions");

        final Vector<Point> sample = new Vector<Point>();
        float sample_ratio = job.getFloat(SpatialSite.SAMPLE_RATIO, 0.01f);
        long sample_size = job.getLong(SpatialSite.SAMPLE_SIZE, 100 * 1024 * 1024);

        LOG.info("Reading a sample of " + (int) Math.round(sample_ratio * 100) + "%");
        ResultCollector<Point> resultCollector = new ResultCollector<Point>() {
            @Override
            public void collect(Point p) {
                sample.add(p.clone());
            }
        };
        OperationsParams params2 = new OperationsParams(job);
        params2.setFloat("ratio", sample_ratio);
        params2.setLong("size", sample_size);
        params2.setClass("outshape", Point.class, Shape.class);
        Sampler.sample(ins, resultCollector, params2);
        long t2 = System.currentTimeMillis();
        System.out.println("Total time for sampling in millis: " + (t2 - t1));
        LOG.info("Finished reading a sample of " + sample.size() + " records");

        partitioner.createFromPoints(inMBR, sample.toArray(new Point[sample.size()]), numPartitions);

        return partitioner;
    } catch (InstantiationException e) {
        e.printStackTrace();
        return null;
    } catch (IllegalAccessException e) {
        e.printStackTrace();
        return null;
    }
}

From source file: edu.usc.pgroup.louvain.hadoop.LouvainMR.java

License: Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    int displayLevel = Integer.parseInt(args[2]);

    boolean v = false;
    if (args.length > 3) {
        v = Boolean.parseBoolean(args[3]);
    }

    conf.setInt(DISPLAY_LEVEL, displayLevel);
    conf.setBoolean(VERBOSE, v);
    conf.set(OUT_PATH, args[1]);

    Job job = new Job(conf);
    job.setJobName(TestJob.class.getName());
    job.setJarByClass(TestJob.class);
    job.setMapperClass(MapCommunity.class);
    job.setReducerClass(ReduceCommunity.class);

    // Set the input and output format classes
    job.setInputFormatClass(GraphInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    TextOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
}

From source file: etl.cmd.test.XFsTestCase.java

License: Apache License

/**
 * Set up the test case.
 *
 * @throws Exception thrown if the test case could not be set up.
 */
protected void setUp() throws Exception {
    super.setUp();
    Configuration conf = new XConfiguration();
    conf.setBoolean("oozie.service.HadoopAccessorService.kerberos.enabled",
            System.getProperty("oozie.test.hadoop.security", "simple").equals("kerberos"));
    conf.set("oozie.service.HadoopAccessorService.keytab.file", getKeytabFile());
    conf.set("oozie.service.HadoopAccessorService.kerberos.principal", getOoziePrincipal());
    conf.set("local.realm", getRealm());

    conf.set("oozie.service.HadoopAccessorService.hadoop.configurations", "*=hadoop-conf");
    conf.set("oozie.service.HadoopAccessorService.action.configurations", "*=action-conf");

    has = new HadoopAccessorService();
    has.init(conf);
    JobConf jobConf = has.createJobConf(getNameNodeUri());
    XConfiguration.copy(conf, jobConf);
    fileSystem = has.createFileSystem(getTestUser(), new URI(getNameNodeUri()), jobConf);
    fsTestDir = initFileSystem(fileSystem);
    if (System.getProperty("oozie.test.hadoop.minicluster2", "false").equals("true")) {
        fileSystem2 = has.createFileSystem(getTestUser(), new URI(getNameNode2Uri()), jobConf);
        fsTestDir2 = initFileSystem(fileSystem2);
    }
}

From source file: eu.scape_project.tpid.TomarPrepareInputdata.java

License: Apache License

/**
 * Start.
 *
 * @param args Command line arguments
 * @throws IOException
 * @throws ParseException
 */
private static void start(String[] args) throws IOException, ParseException {

    // hadoop configuration
    Configuration hadoopConf = new Configuration();
    // Command line interface
    config = new TpidCliConfig();

    CommandLineParser cmdParser = new PosixParser();
    GenericOptionsParser gop = new GenericOptionsParser(hadoopConf, args);

    TpidOptions tpidOptions = new TpidOptions();
    CommandLine cmd = cmdParser.parse(tpidOptions.options, gop.getRemainingArgs());
    if ((args.length == 0) || (cmd.hasOption(tpidOptions.HELP_OPT))) {
        tpidOptions.exit("Help", 0);
    } else {
        tpidOptions.initOptions(cmd, config);
    }

    // configuration properties
    if (config.getPropertiesFilePath() != null) {
        pu = new PropertyUtil(config.getPropertiesFilePath(), true);
    } else {
        pu = new PropertyUtil("/eu/scape_project/tpid/config.properties", false);
    }

    // cli parameter has priority over default configuration
    int cliParamNumPerInv = config.getNumItemsPerInvokation();
    int defaultNumPerInv = Integer.parseInt(pu.getProp("default.itemsperinvokation"));
    int numPerInv = (cliParamNumPerInv != 0) ? cliParamNumPerInv : defaultNumPerInv;
    // setting hadoop configuration parameters so that they can be used
    // during MapReduce
    hadoopConf.setInt("num_items_per_task", numPerInv);
    hadoopConf.set("output_file_suffix", pu.getProp("default.outputfilesuffix"));
    hadoopConf.set("scape_platform_invoke", pu.getProp("tomar.invoke.command"));
    hadoopConf.set("unpack_hdfs_path", pu.getProp("default.hdfsdir.unpacked"));
    hadoopConf.set("joboutput_hdfs_path", pu.getProp("default.hdfsdir.joboutput"));
    hadoopConf.set("tooloutput_hdfs_path", pu.getProp("default.hdfsdir.toolout"));
    hadoopConf.set("container_file_suffix", pu.getProp("containerfilesuffix"));
    hadoopConf.set("tomar_param_pattern", pu.getProp("tomar.param.pattern"));
    hadoopConf.setBoolean("pseudo_distributed", config.isPseudoDistributed());
    startHadoopJob(hadoopConf);
}

From source file: fi.tkk.ics.hadoop.bam.cli.CLIMRBAMPlugin.java

License: Open Source License

/** Should be called before accessing any of the protected data such as
 * samFormat.
 */
@Override
public boolean cacheAndSetProperties(CmdLineParser parser) {
    if (!super.cacheAndSetProperties(parser))
        return false;

    if (!cacheSAMFormat(parser))
        return false;

    final Configuration conf = getConf();

    conf.setBoolean(AnySAMInputFormat.TRUST_EXTS_PROPERTY, !parser.getBoolean(noTrustExtsOpt));

    // Let the output format know if we're going to merge the output, so that
    // it doesn't write headers into the intermediate files.
    conf.setBoolean(KeyIgnoringAnySAMOutputFormat.WRITE_HEADER_PROPERTY, outPath == null);

    return true;
}

From source file: fi.tkk.ics.hadoop.bam.cli.plugins.chipster.Summarize.java

License: Open Source License

@Override
protected int run(CmdLineParser parser) {

    final List<String> args = parser.getRemainingArgs();
    switch (args.size()) {
    case 0:
        return missingArg("WORKDIR");
    case 1:
        return missingArg("LEVELS");
    case 2:
        return missingArg("INPATH");
    default:
        break;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    levels = args.get(1).split(",");
    for (String l : levels) {
        try {
            int lvl = Integer.parseInt(l);
            if (lvl > 0)
                continue;
            System.err.printf("summarize :: summary level '%d' is not positive!\n", lvl);
        } catch (NumberFormatException e) {
            System.err.printf("summarize :: summary level '%s' is not an integer!\n", l);
        }
        return 3;
    }

    wrkDir = new Path(args.get(0));
    final Path bam = new Path(args.get(2));

    final boolean sort = parser.getBoolean(sortOpt);

    final Configuration conf = getConf();

    conf.setBoolean(AnySAMInputFormat.TRUST_EXTS_PROPERTY, !parser.getBoolean(noTrustExtsOpt));

    // Used by Utils.getMergeableWorkFile() to name the output files.
    wrkFile = bam.getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, wrkFile);

    conf.setStrings(SummarizeReducer.SUMMARY_LEVELS_PROP, levels);

    try {
        try {
            // There's a lot of different Paths here, and it can get a bit
            // confusing. Here's how it works:
            //
            // - outPath is the output dir for the final merged output, given
            //   with the -o parameter.
            //
            // - wrkDir is the user-given path where the outputs of the
            //   reducers go.
            //
            // - mergedTmpDir (defined further below) is $wrkDir/sort.tmp: if
            //   we are sorting, the summaries output in the first Hadoop job
            //   are merged in there.
            //
            // - mainSortOutputDir is $wrkDir/sorted.tmp: getSortOutputDir()
            //   gives a per-level/strand directory under it, which is used by
            //   doSorting() and mergeOne(). This is necessary because we
            //   cannot have multiple Hadoop jobs outputting into the same
            //   directory at the same time, as explained in the comment in
            //   sortMerged().

            // Required for path ".", for example.
            wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

            mainSortOutputDir = sort ? new Path(wrkDir, "sorted.tmp") : null;

            if (!runSummary(bam))
                return 4;
        } catch (IOException e) {
            System.err.printf("summarize :: Summarizing failed: %s\n", e);
            return 4;
        }

        Path mergedTmpDir = null;
        try {
            if (sort) {
                mergedTmpDir = new Path(wrkDir, "sort.tmp");
                mergeOutputs(mergedTmpDir);

            } else if (outPath != null)
                mergeOutputs(outPath);

        } catch (IOException e) {
            System.err.printf("summarize :: Merging failed: %s\n", e);
            return 5;
        }

        if (sort) {
            if (!doSorting(mergedTmpDir))
                return 6;

            // Reset this since SummarySort uses it.
            conf.set(Utils.WORK_FILENAME_PROPERTY, wrkFile);

            tryDelete(mergedTmpDir);

            if (outPath != null)
                try {
                    sorted = true;
                    mergeOutputs(outPath);
                } catch (IOException e) {
                    System.err.printf("summarize :: Merging sorted output failed: %s\n", e);
                    return 7;
                }
            else {
                // Move the unmerged results out of the mainSortOutputDir
                // subdirectories to wrkDir.

                System.out.println("summarize :: Moving outputs from temporary directories...");
                t.start();

                try {
                    final FileSystem fs = wrkDir.getFileSystem(conf);
                    for (String lvl : levels) {
                        final FileStatus[] parts;

                        try {
                            parts = fs.globStatus(new Path(new Path(mainSortOutputDir, lvl + "[fr]"),
                                    "*-[0-9][0-9][0-9][0-9][0-9][0-9]"));
                        } catch (IOException e) {
                            System.err.printf("summarize :: Couldn't move level %s results: %s", lvl, e);
                            continue;
                        }

                        for (FileStatus part : parts) {
                            final Path path = part.getPath();
                            try {
                                fs.rename(path, new Path(wrkDir, path.getName()));
                            } catch (IOException e) {
                                System.err.printf("summarize :: Couldn't move '%s': %s", path, e);
                            }
                        }
                    }
                } catch (IOException e) {
                    System.err.printf("summarize :: Moving results failed: %s", e);
                }
                System.out.printf("summarize :: Moved in %d.%03d s.\n", t.stopS(), t.fms());
            }
            tryDelete(mainSortOutputDir);
        }
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    return 0;
}

From source file: fi.tkk.ics.hadoop.bam.cli.plugins.VCFSort.java

License: Open Source License

@Override
protected int run(CmdLineParser parser) {
    final List<String> args = parser.getRemainingArgs();
    if (args.isEmpty()) {
        System.err.println("vcf-sort :: WORKDIR not given.");
        return 3;
    }
    if (args.size() == 1) {
        System.err.println("vcf-sort :: INPATH not given.");
        return 3;
    }
    if (!cacheAndSetProperties(parser))
        return 3;

    Path wrkDir = new Path(args.get(0));
    final Path inPath = new Path(args.get(1));

    final Configuration conf = getConf();

    VCFFormat vcfFormat = null;

    final String f = (String) parser.getOptionValue(formatOpt);
    if (f != null) {
        try {
            vcfFormat = VCFFormat.valueOf(f.toUpperCase(Locale.ENGLISH));
        } catch (IllegalArgumentException e) {
            System.err.printf("%s :: invalid format '%s'\n", getCommandName(), f);
            return 3;
        }
    }
    if (vcfFormat == null)
        vcfFormat = outPath == null ? VCFFormat.BCF : VCFFormat.inferFromFilePath(outPath);

    conf.setBoolean(VCFInputFormat.TRUST_EXTS_PROPERTY, !parser.getBoolean(noTrustExtsOpt));

    conf.setBoolean(KeyIgnoringVCFOutputFormat.WRITE_HEADER_PROPERTY, outPath == null);

    conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, vcfFormat.toString());

    // Used by Utils.getMergeableWorkFile() to name the output files.
    final String intermediateOutName = (outPath == null ? inPath : outPath).getName();
    conf.set(Utils.WORK_FILENAME_PROPERTY, intermediateOutName);

    conf.set(SortOutputFormat.INPUT_PATH_PROP, inPath.toString());

    final Timer t = new Timer();
    try {
        // Required for path ".", for example.
        wrkDir = wrkDir.getFileSystem(conf).makeQualified(wrkDir);

        Utils.configureSampling(wrkDir, intermediateOutName, conf);

        final Job job = new Job(conf);

        job.setJarByClass(VCFSort.class);
        job.setMapperClass(Mapper.class);
        job.setReducerClass(VCFSortReducer.class);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(VariantContextWritable.class);

        job.setInputFormatClass(VCFInputFormat.class);
        job.setOutputFormatClass(SortOutputFormat.class);

        FileInputFormat.addInputPath(job, inPath);
        FileOutputFormat.setOutputPath(job, wrkDir);

        job.setPartitionerClass(TotalOrderPartitioner.class);

        System.out.println("vcf-sort :: Sampling...");
        t.start();

        InputSampler.<LongWritable, VariantContextWritable>writePartitionFile(job,
                new InputSampler.RandomSampler<LongWritable, VariantContextWritable>(0.01, 10000,
                        Math.max(100, reduceTasks)));

        System.out.printf("vcf-sort :: Sampling complete in %d.%03d s.\n", t.stopS(), t.fms());

        job.submit();

        System.out.println("vcf-sort :: Waiting for job completion...");
        t.start();

        if (!job.waitForCompletion(verbose)) {
            System.err.println("vcf-sort :: Job failed.");
            return 4;
        }

        System.out.printf("vcf-sort :: Job complete in %d.%03d s.\n", t.stopS(), t.fms());

    } catch (IOException e) {
        System.err.printf("vcf-sort :: Hadoop error: %s\n", e);
        return 4;
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }

    if (outPath != null)
        try {
            System.out.println("vcf-sort :: Merging output...");
            t.start();

            final OutputStream outs = outPath.getFileSystem(conf).create(outPath);

            // First, place the VCF or BCF header.

            final WrapSeekable ins = WrapSeekable.openPath(conf, inPath);
            final VCFHeader header = VCFHeaderReader.readHeaderFrom(ins);
            ins.close();

            final VariantContextWriter writer;

            switch (vcfFormat) {
            case VCF:
                writer = VariantContextWriterFactory.create(new FilterOutputStream(outs) {
                    @Override
                    public void close() throws IOException {
                        this.out.flush();
                    }
                }, null, VariantContextWriterFactory.NO_OPTIONS);
                break;

            case BCF:
                writer = VariantContextWriterFactory
                        .create(new FilterOutputStream(new BlockCompressedOutputStream(outs, null)) {
                            @Override
                            public void close() throws IOException {
                                this.out.flush();
                            }
                        }, null, EnumSet.of(Options.FORCE_BCF));
                break;

            default:
                assert false;
                writer = null;
                break;
            }

            writer.writeHeader(header);
            writer.close();

            // Then, the actual VCF or BCF contents.
            Utils.mergeInto(outs, wrkDir, "", "", conf, "vcf-sort");

            // And if BCF, the BGZF terminator.
            if (vcfFormat == VCFFormat.BCF)
                outs.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);

            outs.close();

            System.out.printf("vcf-sort :: Merging complete in %d.%03d s.\n", t.stopS(), t.fms());

        } catch (IOException e) {
            System.err.printf("vcf-sort :: Output merging failed: %s\n", e);
            return 5;
        }
    return 0;
}

From source file: fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop.ExpressionHadoopModule.java

License: LGPL

/**
   * Create the Hadoop Job for HTSeq-count.
   * @param parentConf the parent Hadoop configuration
   * @param context the task context
   * @param alignmentsData alignment data
   * @param featureAnnotationData feature annotations data
   * @param gtfFormat true if the annotation file is in GTF format
   * @param genomeDescriptionData genome description data
   * @param outData output data
   * @param genomicType genomic type
   * @param attributeId attributeId
   * @param splitAttributeValues split attribute values
   * @param stranded stranded mode
   * @param overlapMode overlap mode
   * @param removeAmbiguousCases true to remove ambiguous cases
   * @param tsamFormat true if the input alignments are in TSAM format
   * @return the configured Job
   * @throws IOException if an error occurs while creating the job
   * @throws BadBioEntryException if an entry of the annotation file is invalid
   * @throws EoulsanException if the job creation fails
   */
  private static Job createJobHTSeqCounter(final Configuration parentConf, final TaskContext context,
          final Data alignmentsData, final Data featureAnnotationData, final boolean gtfFormat,
          final Data genomeDescriptionData, final Data outData, final String genomicType,
          final String attributeId, final boolean splitAttributeValues, final StrandUsage stranded,
          final OverlapMode overlapMode, final boolean removeAmbiguousCases, final boolean tsamFormat)
          throws IOException, BadBioEntryException, EoulsanException {

      final Configuration jobConf = new Configuration(parentConf);

      // Get input DataFile
      DataFile inputDataFile = alignmentsData.getDataFile();

      if (inputDataFile == null) {
          throw new IOException("No input file found.");
      }

      final String dataFileSource;

      if (tsamFormat) {
          dataFileSource = StringUtils.filenameWithoutExtension(inputDataFile.getSource()) + TSAM_EXTENSION;
      } else {
          dataFileSource = inputDataFile.getSource();
      }

      // Set input path
      final Path inputPath = new Path(dataFileSource);

      // Get annotation DataFile
      final DataFile annotationDataFile = featureAnnotationData.getDataFile();

      // Get output file
      final DataFile outFile = outData.getDataFile();

      // Get temporary file
      final DataFile tmpFile = new DataFile(outFile.getParent(), outFile.getBasename() + ".tmp");

      getLogger().fine("sample: " + alignmentsData.getName());
      getLogger().fine("inputPath.getName(): " + inputPath.getName());
      getLogger().fine("annotationDataFile: " + annotationDataFile.getSource());
      getLogger().fine("outFile: " + outFile.getSource());
      getLogger().fine("tmpFile: " + tmpFile.getSource());

      jobConf.set("mapred.child.java.opts", "-Xmx1024m");

      // Set counter group
      jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);

      // Set Genome description path
      final DataFile genomeDescDataFile = genomeDescriptionData.getDataFile();
      jobConf.set(GENOME_DESC_PATH_KEY, genomeDescDataFile.getSource());

      // Set the "stranded" parameter
      jobConf.set(HTSeqCountMapper.STRANDED_PARAM, stranded.getName());

      // Set the "overlap mode" parameter
      jobConf.set(HTSeqCountMapper.OVERLAP_MODE_PARAM, overlapMode.getName());

      // Set the "remove ambiguous cases" parameter
      jobConf.setBoolean(HTSeqCountMapper.REMOVE_AMBIGUOUS_CASES, removeAmbiguousCases);

      final Path featuresIndexPath = getAnnotationIndexSerializedPath(featureAnnotationData.getDataFile());

      getLogger().info("featuresIndexPath: " + featuresIndexPath);

      // Create serialized feature index
      if (!PathUtils.isFile(featuresIndexPath, jobConf)) {

          final Locker lock = createZookeeperLock(parentConf, context);

          lock.lock();

          createFeaturesIndex(context, annotationDataFile, gtfFormat, genomicType, attributeId,
                  splitAttributeValues, stranded, genomeDescDataFile, featuresIndexPath, jobConf);

          lock.unlock();
      }

      // Create the job and its name
      final Job job = Job.getInstance(jobConf,
              "Expression computation with htseq-count (" + alignmentsData.getName() + ", " + inputPath.getName()
                      + ", " + annotationDataFile.getSource() + ", " + genomicType + ", " + attributeId
                      + ", stranded: " + stranded + ", removeAmbiguousCases: " + removeAmbiguousCases + ")");

      // Set the path to the features index
      job.addCacheFile(featuresIndexPath.toUri());

      // Set the jar
      job.setJarByClass(ExpressionHadoopModule.class);

      // Set input path
      FileInputFormat.setInputPaths(job, inputPath);

      // Set input format
      job.setInputFormatClass(SAMInputFormat.class);

      // Set the mapper class
      job.setMapperClass(HTSeqCountMapper.class);

      // Set the combiner class
      job.setCombinerClass(HTSeqCountReducer.class);

      // Set the reducer class
      job.setReducerClass(HTSeqCountReducer.class);

      // Set the output format
      job.setOutputFormatClass(ExpressionOutputFormat.class);

      // Set the output key class
      job.setOutputKeyClass(Text.class);

      // Set the output value class
      job.setOutputValueClass(LongWritable.class);

      // Set output path
      FileOutputFormat.setOutputPath(job, new Path(tmpFile.getSource()));

      return job;
  }

From source file: gaffer.accumulo.bulkimport.BulkImportDriver.java

License: Apache License

public int run(String[] args) throws Exception {
    // Usage
    if (args.length < 3) {
        System.err.println("Usage: " + BulkImportDriver.class.getName()
                + " <inputpath> <output_path> <accumulo_properties_file>");
        return 1;
    }

    // Gets paths
    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1] + "/data_for_accumulo/");
    Path splitsFilePath = new Path(args[1] + "/splits_file");
    String accumuloPropertiesFile = args[2];

    // Hadoop configuration
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Connect to Accumulo
    AccumuloConfig accConf = new AccumuloConfig(accumuloPropertiesFile);
    Connector conn = Accumulo.connect(accConf);
    String tableName = accConf.getTable();

    // Check if the table exists
    if (!conn.tableOperations().exists(tableName)) {
        System.err.println("Table " + tableName + " does not exist - create the table before running this");
        return 1;
    }

    // Get the current splits from the table.
    // (This assumes that we have already created the table using InitialiseTable.)
    Collection<Text> splits = conn.tableOperations().getSplits(tableName);
    int numSplits = splits.size();
    System.out.println("Number of splits in table is " + numSplits);

    // Write current splits to a file (this is needed so that the following MapReduce
    // job can move them to the DistributedCache).
    IngestUtils.createSplitsFile(conn, tableName, fs, splitsFilePath);

    // Run MapReduce to output data suitable for bulk import to Accumulo
    // Conf and job
    conf.setBoolean("mapred.compress.map.output", true);
    conf.setClass("mapred.map.output.compression.codec", SnappyCodec.class, CompressionCodec.class);
    Job job = new Job(conf);
    job.setJarByClass(getClass());
    job.setJobName("Convert data to Accumulo format: input = " + inputPath + ", output = " + outputPath);

    // Input
    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, inputPath);

    // Mapper
    job.setMapperClass(BulkImportMapper.class);
    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);

    // Partitioner
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath.toString());

    // Reducer
    job.setReducerClass(BulkImportReducer.class);
    job.setOutputKeyClass(Key.class);
    job.setOutputValueClass(Value.class);
    job.setNumReduceTasks(numSplits + 1);

    // Output
    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    AccumuloFileOutputFormat.setOutputPath(job, outputPath);

    // Run job
    job.waitForCompletion(true);

    // Successful?
    if (!job.isSuccessful()) {
        System.err.println("Error running job");
        return 1;
    }

    return 0;
}

From source file: gaffer.accumulo.inputformat.example.ExampleDriver.java

License: Apache License

public int run(String[] args) throws Exception {
    // Usage
    if (args.length != 6 && args.length != 7) {
        System.err.println(USAGE);
        return 1;
    }

    // Parse options
    Path outputPath = new Path(args[0]);
    String accumuloPropertiesFile = args[1];
    int numReduceTasks;
    try {
        numReduceTasks = Integer.parseInt(args[2]);
    } catch (NumberFormatException e) {
        System.err.println(USAGE);
        return 1;
    }
    Date startDate = null;
    Date endDate = null;
    boolean useTimeWindow = false;
    if (!args[3].equals("null") && !args[4].equals("null")) {
        try {
            startDate = DATE_FORMAT.parse(args[3]);
            endDate = DATE_FORMAT.parse(args[4]);
        } catch (ParseException e) {
            System.err.println("Error parsing dates: " + args[3] + " " + args[4] + " " + e.getMessage());
            return 1;
        }
        useTimeWindow = true;
    }
    boolean rollUpOverTimeAndVisibility = Boolean.parseBoolean(args[5]);
    boolean seedsSpecified = (args.length == 7);
    String seedsFile = "";
    if (seedsSpecified) {
        seedsFile = args[6];
    }

    // Hadoop configuration
    Configuration conf = getConf();

    // Connect to Accumulo, so we can check connection and check that the
    // table exists
    AccumuloConfig accConf = new AccumuloConfig(accumuloPropertiesFile);
    Connector conn = Accumulo.connect(accConf);
    String tableName = accConf.getTable();
    Authorizations authorizations = conn.securityOperations().getUserAuthorizations(accConf.getUserName());

    // Check if the table exists
    if (!conn.tableOperations().exists(tableName)) {
        System.err.println("Table " + tableName + " does not exist.");
        return 1;
    }

    // Create AccumuloBackedGraph and set view
    AccumuloBackedGraph graph = new AccumuloBackedGraph(conn, tableName);
    //    - Time window
    if (useTimeWindow) {
        graph.setTimeWindow(startDate, endDate);
    }
    //  - Roll up over time and visibility iterator
    graph.rollUpOverTimeAndVisibility(rollUpOverTimeAndVisibility);
    //    - If not specifying seeds then add iterator to avoid seeing the same edge multiple times
    if (seedsSpecified) {
        Set<TypeValue> typeValues = new HashSet<TypeValue>();
        BufferedReader reader = new BufferedReader(new FileReader(seedsFile));
        String line;
        while ((line = reader.readLine()) != null) {
            String[] tokens = line.split("\\|");
            if (tokens.length != 2) {
                System.err.println("Invalid line: " + line);
                continue;
            }
            String type = tokens[0];
            String value = tokens[1];
            typeValues.add(new TypeValue(type, value));
        }
        reader.close();
        // Use AccumuloBackedGraph to update the configuration with the view added above
        graph.setConfiguration(conf, typeValues, accConf);
    } else {
        // Use AccumuloBackedGraph to update the configuration with the view added above
        graph.setConfiguration(conf, accConf);
    }

    // Conf
    conf.setBoolean("mapred.compress.map.output", true);
    conf.setClass("mapred.map.output.compression.codec", SnappyCodec.class, CompressionCodec.class);

    // Job
    Job job = new Job(conf);
    job.setJarByClass(getClass());
    job.setJobName("Example MapReduce against Gaffer data in Accumulo format: input = " + tableName
            + ", output = " + outputPath);

    // Input format - use BatchScannerElementInputFormat if seeds have been specified (as that creates fewer
    // splits); otherwise use ElementInputFormat which is based on the standard AccumuloInputFormat.
    if (seedsSpecified) {
        job.setInputFormatClass(BatchScannerElementInputFormat.class);
    } else {
        job.setInputFormatClass(ElementInputFormat.class);
    }

    // Mapper
    job.setMapperClass(ExampleMapper.class);
    job.setMapOutputKeyClass(GraphElement.class);
    job.setMapOutputValueClass(SetOfStatistics.class);

    // Reducer - use default IdentityReducer for this example
    job.setOutputKeyClass(GraphElement.class);
    job.setOutputValueClass(SetOfStatistics.class);
    job.setNumReduceTasks(numReduceTasks);

    // Output
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);

    System.out.println("Running MapReduce job over:");
    System.out.println("\tTable: " + accConf.getTable());
    System.out.println("\tUser: " + accConf.getUserName());
    System.out.println("\tAuths: " + authorizations);
    if (useTimeWindow) {
        System.out.println("\tFilter by time: start time is " + DATE_FORMAT.format(startDate) + ", "
                + DATE_FORMAT.format(endDate));
    } else {
        System.out.println("\tFilter by time is off");
    }
    System.out.println("\tRoll up over time and visibility: " + rollUpOverTimeAndVisibility);

    // Run job
    job.waitForCompletion(true);

    // Successful?
    if (!job.isSuccessful()) {
        System.err.println("Error running job");
        return 1;
    }

    return 0;
}