Example usage for org.apache.hadoop.conf Configuration setLong

List of usage examples for org.apache.hadoop.conf Configuration setLong

Introduction

On this page you can find example usage for org.apache.hadoop.conf Configuration setLong.

Prototype

public void setLong(String name, long value) 

Document

Set the value of the name property to a long.
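
A minimal sketch of the call, paired with the matching getLong read. The property name and values below are placeholders chosen for illustration, not taken from any of the examples that follow.

import org.apache.hadoop.conf.Configuration;

public class SetLongExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store a long under a property name of our choosing.
        conf.setLong("example.max.bytes", 64L * 1024 * 1024);

        // Read it back; the second argument is the default returned
        // when the property is not set.
        long maxBytes = conf.getLong("example.max.bytes", 0L);
        System.out.println("example.max.bytes = " + maxBytes);
    }
}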

Usage

From source file:com.splicemachine.test.SpliceTestPlatformConfig.java

License:Apache License

public static Configuration create(String hbaseRootDirUri, Integer masterPort, Integer masterInfoPort,
        Integer regionServerPort, Integer regionServerInfoPort, Integer derbyPort, boolean failTasksRandomly) {

    Configuration config = HConfiguration.unwrapDelegate();

    config.set(SQLConfiguration.STORAGE_FACTORY_HOME, hbaseRootDirUri);

    //
    // Coprocessors
    //
    config.set("hbase.coprocessor.regionserver.classes", getRegionServerCoprocessorsAsString());
    config.set("hbase.coprocessor.region.classes", getRegionCoprocessorsAsString());
    config.set("hbase.coprocessor.master.classes", getMasterCoprocessorsAsString());

    //
    // Networking
    //
    config.set("hbase.zookeeper.quorum", "127.0.0.1:2181");
    config.setInt("hbase.master.port", masterPort);
    config.setInt("hbase.master.info.port", masterInfoPort);
    config.setInt("hbase.regionserver.port", regionServerPort);
    config.setInt("hbase.regionserver.info.port", regionServerInfoPort);
    config.setInt("hbase.master.jmx.port", HConfiguration.DEFAULT_JMX_BIND_PORT); // this is set because the HBase master and regionserver are running on the same machine and in the same JVM
    config.setInt(SQLConfiguration.NETWORK_BIND_PORT, derbyPort);
    config.setClass(DefaultStoreEngine.DEFAULT_COMPACTOR_CLASS_KEY, SpliceDefaultCompactor.class,
            Compactor.class);
    // config.setClass(ConsistencyControlUtils.MVCC_IMPL, SIMultiVersionConsistencyControl.class, ConsistencyControl.class);
    config.setClass(DefaultStoreEngine.DEFAULT_COMPACTION_POLICY_CLASS_KEY, SpliceDefaultCompactionPolicy.class,
            CompactionPolicy.class);

    //
    // Networking -- interfaces
    //
    // force use of loop back interface on MacOSX, else don't set it
    //        if (System.getProperty("os.name").contains("Mac") ) {
    //            String interfaceName = "lo0";
    //            config.set("hbase.zookeeper.dns.interface", interfaceName);
    //            config.set("hbase.master.dns.interface", interfaceName);
    //            config.set("hbase.regionserver.dns.interface", interfaceName);
    //        }

    //
    // File System
    //
    config.set("fs.defaultFS", "file:///"); // MapR Hack, tells it local filesystem // fs.default.name is deprecated
    config.set(FileSystem.FS_DEFAULT_NAME_KEY, "file:///");
    config.setDouble("yarn.nodemanager.resource.io-spindles", 2.0);
    config.set("fs.default.name", "file:///");
    config.set("yarn.nodemanager.container-executor.class",
            "org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor");

    // Must allow Cygwin instance to config its own rootURI
    if (!"CYGWIN".equals(hbaseRootDirUri)) {
        config.set("hbase.rootdir", hbaseRootDirUri);
    }

    //
    // Threads, timeouts
    //
    config.setLong("hbase.rpc.timeout", MINUTES.toMillis(2));
    config.setLong("hbase.client.scanner.timeout.period", MINUTES.toMillis(2)); // hbase.regionserver.lease.period is deprecated
    config.setLong("hbase.client.operation.timeout", MINUTES.toMillis(2));
    config.setLong("hbase.regionserver.handler.count", 200);
    config.setLong("hbase.regionserver.msginterval", 1000);
    config.setLong("hbase.master.event.waiting.time", 20);
    config.setLong("hbase.master.lease.thread.wakefrequency", SECONDS.toMillis(3));
    //        config.setBoolean("hbase.master.loadbalance.bytable",true);
    config.setInt("hbase.balancer.period", 5000);

    config.setLong("hbase.server.thread.wakefrequency", SECONDS.toMillis(1));
    config.setLong("hbase.client.pause", 100);

    //
    // Compaction Controls
    //
    config.setLong("hbase.hstore.compaction.min", 5); // min number of eligible files before we compact
    config.setLong("hbase.hstore.compaction.max", 10); // max files to be selected for a single minor compaction
    config.setLong("hbase.hstore.compaction.min.size", 16 * MiB); // store files smaller than this will always be eligible for minor compaction.  HFiles this size or larger are evaluated by hbase.hstore.compaction.ratio to determine if they are eligible
    config.setLong("hbase.hstore.compaction.max.size", 248 * MiB); // store files larger than this will be excluded from compaction
    config.setFloat("hbase.hstore.compaction.ratio", 1.25f); // default is 1.2f, at one point we had this set to 0.25f and 25f (which was likely a typo)

    //
    // Memstore, store files, splits
    //
    config.setLong(HConstants.HREGION_MAX_FILESIZE, 32 * MiB); // hbase.hregion.max.filesize
    config.setLong("hbase.hregion.memstore.flush.size", 128 * MiB); // was 512 MiB
    config.setLong("hbase.hregion.memstore.block.multiplier", 4);
    config.setFloat("hbase.regionserver.global.memstore.size", 0.25f); // set mem store to 25% of heap
    config.setLong("hbase.hstore.blockingStoreFiles", 20);
    //        config.set("hbase.regionserver.region.split.policy", "org.apache.hadoop.hbase.regionserver.ConstantSizeRegionSplitPolicy"); // change default split policy.  this makes more sense for a standalone/single regionserver

    // Support SI
    //config.setClass(HConstants.MVCC_IMPL, SIMultiVersionConsistencyControl.class, ConsistencyControl.class);

    //
    // HFile
    //
    config.setInt("hfile.index.block.max.size", 16 * 1024); // 16KiB
    config.setFloat("hfile.block.cache.size", 0.25f); // set block cache to 25% of heap
    config.setFloat("io.hfile.bloom.error.rate", (float) 0.005);
    config.setBoolean(CacheConfig.CACHE_BLOOM_BLOCKS_ON_WRITE_KEY, true); // hfile.block.bloom.cacheonwrite
    //config.set("hbase.master.hfilecleaner.plugins", getHFileCleanerAsString());
    config.set("hbase.master.hfilecleaner.plugins", getHFileCleanerAsString());
    //
    // Misc
    //
    config.set("hbase.cluster.distributed", "true"); // don't start zookeeper for us
    config.set("hbase.master.distributed.log.splitting", "false"); // TODO: explain why we are setting this

    // AWS Credentials for test...
    //

    config.set(ACCESS_KEY, "AKIAJ6HBMCK5ALHVBFPQ");
    config.set(SECRET_KEY, "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");

    //
    // Splice
    //

    config.setLong("splice.ddl.drainingWait.maximum", SECONDS.toMillis(15)); // wait 15 seconds before bailing on bad ddl statements
    config.setLong("splice.ddl.maxWaitSeconds", 120000);
    //
    // Snapshots
    //
    config.setBoolean("hbase.snapshot.enabled", true);

    HConfiguration.reloadConfiguration(config);
    return HConfiguration.unwrapDelegate();
}
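
The excerpt above relies on a few members of the enclosing class that the listing does not show: MINUTES, SECONDS and MiB. Presumably these amount to the usual java.util.concurrent.TimeUnit static imports plus a mebibyte constant, roughly as sketched here; treat the constant as a hypothetical reconstruction rather than the actual Splice source.

import static java.util.concurrent.TimeUnit.MINUTES;
import static java.util.concurrent.TimeUnit.SECONDS;

// Hypothetical constant; the excerpt only shows it being used.
private static final long MiB = 1024L * 1024L;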

From source file:com.taobao.adfs.distributed.DistributedDataTest.java

License:Apache License

@BeforeClass
static public void setupAfterClass() throws Throwable {
    Utilities.configureLog4j(null, "distributed.logger.conf.", Level.DEBUG);
    Configuration conf = new Configuration(false);
    conf.set("distributed.data.path", "target/test" + DistributedDataTest.class.getSimpleName());
    conf.setLong("distributed.data.delete.check.interval.time", 1);
    conf.set("distributed.data.format", "true");
    exampleData = new ExampleData(conf);
    exampleData.format();
}

From source file:com.teradata.benchto.generator.HiveTypesGenerator.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            Option.builder("format").required().hasArg().desc("file format (orc, parquet or text)").build());
    options.addOption(Option.builder("type").required().hasArg().desc(
            "hive type to be generated (bigint, int, boolean, double, binary, date, timestamp, string, decimal or varchar)")
            .build());
    options.addOption(Option.builder("rows").required().hasArg().desc("total row count").build());
    options.addOption(Option.builder("mappers").required().hasArg().desc("total mappers count").build());
    options.addOption(Option.builder("path").hasArg()
            .desc("base path for generating files, default is: /benchmarks/benchto/types").build());
    options.addOption(Option.builder("regex").numberOfArgs(3)
            .desc("generate varchars from regex pattern, arguments are: pattern, min length, max length")
            .build());

    CommandLine line;
    String format;
    String hiveType;
    long numberOfRows;
    long numberOfFiles;
    String basePath;
    Optional<String> regexPattern = Optional.absent();
    Optional<Integer> regexMinLength = Optional.absent();
    Optional<Integer> regexMaxLength = Optional.absent();
    try {
        line = new DefaultParser().parse(options, args);
        format = line.getOptionValue("format");
        hiveType = line.getOptionValue("type");
        numberOfRows = parseLong(line.getOptionValue("rows"));
        numberOfFiles = parseLong(line.getOptionValue("mappers"));
        basePath = line.getOptionValue("path", "/benchmarks/benchto/types");
        if (line.hasOption("regex")) {
            String[] values = line.getOptionValues("regex");
            regexPattern = Optional.of(values[0]);
            regexMinLength = Optional.of(parseInt(values[1]));
            regexMaxLength = Optional.of(parseInt(values[2]));
        }
    } catch (Exception e) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("benchto-generator", options);
        throw e;
    }

    String jobName = format("GenerateData-%s-%s-%d", format, hiveType, numberOfRows);
    Path outputDir = new Path(format("%s/%s-%s/%d", basePath, format, hiveType, numberOfRows));
    Class<? extends OutputFormat> outputFormatClass = getOutputFormatClass(format);

    LOG.info("Generating " + numberOfRows + " " + hiveType + "s, directory: " + outputDir
            + ", number of files: " + numberOfFiles);

    Configuration configuration = new Configuration();
    configuration.set(FORMAT_PROPERTY_NAME, format);
    configuration.set(HIVE_TYPE_PROPERTY_NAME, hiveType);
    configuration.setLong(NUM_ROWS_PROPERTY_NAME, numberOfRows);
    configuration.setLong(NUM_MAPS, numberOfFiles);
    if (regexPattern.isPresent()) {
        configuration.set(REGEX_PATTERN, regexPattern.get());
        configuration.setInt(REGEX_MIN_LENGTH, regexMinLength.get());
        configuration.setInt(REGEX_MAX_LENGTH, regexMaxLength.get());
    }

    Job generatorJob = Job.getInstance(configuration, jobName);
    FileOutputFormat.setOutputPath(generatorJob, outputDir);
    ParquetOutputFormat.setWriteSupportClass(generatorJob, DataWritableWriteSupport.class);
    generatorJob.setJarByClass(HiveTypesGenerator.class);
    generatorJob.setMapperClass(HiveTypesMapper.class);
    generatorJob.setNumReduceTasks(0);
    generatorJob.setOutputKeyClass(NullWritable.class);
    generatorJob.setOutputValueClass(Writable.class);
    generatorJob.setInputFormatClass(CounterInputFormat.class);
    generatorJob.setOutputFormatClass(outputFormatClass);

    return generatorJob.waitForCompletion(true) ? 0 : 1;
}
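
The values stored here with setLong are intended to be read back on the task side with the matching getLong calls. A sketch of what that read could look like inside a Mapper subclass such as HiveTypesMapper follows; the method body and the default values are assumptions for illustration, not code from the Benchto source.

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    long numberOfRows = conf.getLong(NUM_ROWS_PROPERTY_NAME, 0L); // assumed default
    long numberOfFiles = conf.getLong(NUM_MAPS, 1L);              // assumed default
    // ... generate roughly numberOfRows / numberOfFiles rows in this task ...
}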

From source file:com.twitter.algebra.nmf.NMFCommon.java

License:Apache License

public static void setNumberOfMapSlots(Configuration conf, FileSystem fs, Path[] paths, String joblabel) {
    if (conf.get(MAPSPLOTS) == null)
        return;
    int mapSlots = conf.getInt(MAPSPLOTS, 1);
    mapSlots = conf.getInt(MAPSPLOTS + "." + joblabel, mapSlots);
    long du = 0;
    try {
        for (Path path : paths)
            du += MapDir.du(path, fs);
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    long splitSize = du / mapSlots;
    log.info("du: " + du + " mapSlots: " + mapSlots + " splitSize: " + splitSize);
    long minSplitSize = (long) (splitSize * 0.9);
    long maxSplitSize = Math.max((long) (splitSize * 1.1), 1024 * 1024);
    conf.setLong("mapred.min.split.size", minSplitSize);
    conf.setLong("mapreduce.min.split.size", minSplitSize);
    conf.setLong("mapred.max.split.size", maxSplitSize);
    conf.setLong("mapreduce.max.split.size", maxSplitSize);
}
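
As a worked example of the sizing above: with du around 10 GiB and 8 map slots, splitSize comes out to 1.25 GiB, so the minimum split size is set to roughly 1.125 GiB (0.9x) and the maximum to roughly 1.375 GiB (1.1x); the Math.max guard only matters when the computed maximum would otherwise fall below 1 MiB. Note that the method writes the same values under both the mapred.* and mapreduce.* spellings of the split-size keys.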

From source file:com.twitter.hraven.etl.JobFileProcessor.java

License:Apache License

/**
 * @param conf
 *          used to connect to HBase
 * @param cluster
 *          for which we are processing
 * @param reprocess
 *          Reprocess those records that may have been processed already.
 *          Otherwise successfully processed job files are skipped.
 * @param reprocessOnly
 *          process only those raw records that were marked to be reprocessed.
 *          When true then reprocess argument is ignored and is assumed to be
 *          true.
 * @param batchSize
 *          the total number of jobs to process in a batch (a MR job scanning
 *          these many records in the raw table).
 * @param minJobId
 *          used to start the scan. If null then there is no min limit on
 *          JobId.
 * @param maxJobId
 *          used to end the scan (inclusive). If null then there is no max
 *          limit on jobId.
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 * @throws ExecutionException
 * @throws RowKeyParseException
 */
private List<JobRunner> getJobRunners(Configuration conf, String cluster, boolean reprocess, int batchSize,
        String minJobId, String maxJobId)
        throws IOException, InterruptedException, ClassNotFoundException, RowKeyParseException {
    List<JobRunner> jobRunners = new LinkedList<JobRunner>();

    JobHistoryRawService jobHistoryRawService = new JobHistoryRawService(conf);
    try {

        // Bind all MR jobs together with one runID.
        long now = System.currentTimeMillis();
        conf.setLong(Constants.MR_RUN_CONF_KEY, now);

        List<Scan> scanList = jobHistoryRawService.getHistoryRawTableScans(cluster, minJobId, maxJobId,
                reprocess, batchSize);

        for (Scan scan : scanList) {
            Job job = getProcessingJob(conf, scan, scanList.size());

            JobRunner jobRunner = new JobRunner(job, null);
            jobRunners.add(jobRunner);
        }

    } finally {
        IOException caught = null;
        try {
            jobHistoryRawService.close();
        } catch (IOException ioe) {
            caught = ioe;
        }

        if (caught != null) {
            throw caught;
        }
    }
    return jobRunners;

}

From source file:com.twitter.hraven.etl.JobFileRawLoader.java

License:Apache License

/**
 * @param myHBaseConf
 *          used to contact HBase and to run jobs against. Should be an HBase
 *          configuration.
 * @param cluster
 *          for which to process records.
 * @param processFileSubstring
 *          return rows where the process file path contains this string. If
 *          <code>null</code> or empty string, then no filtering is applied.
 * @param forceReprocess
 *          whether all jobs for which a file is loaded needs to be
 *          reprocessed.
 * @return whether all job files for all processRecords were properly
 *         processed.
 * @throws IOException
 * @throws ClassNotFoundException
 *           when problems occur setting up the job.
 * @throws InterruptedException
 */
private boolean processRecordsFromHBase(Configuration myHBaseConf, String cluster, String processFileSubstring,
        boolean forceReprocess) throws IOException, InterruptedException, ClassNotFoundException {

    int failures = 0;

    ProcessRecordService processRecordService = new ProcessRecordService(myHBaseConf);
    // Grab all records.
    List<ProcessRecord> processRecords = processRecordService.getProcessRecords(cluster, PREPROCESSED,
            Integer.MAX_VALUE, processFileSubstring);
    try {

        LOG.info("ProcessRecords for " + cluster + ": " + processRecords.size());

        // Bind all MR jobs together with one runID.
        long now = System.currentTimeMillis();
        myHBaseConf.setLong(Constants.MR_RUN_CONF_KEY, now);

        myHBaseConf.setBoolean(Constants.FORCE_REPROCESS_CONF_KEY, forceReprocess);

        // Iterate over 0 based list in reverse order
        for (int j = processRecords.size() - 1; j >= 0; j--) {
            ProcessRecord processRecord = processRecords.get(j);

            LOG.info("Processing " + processRecord);

            boolean success = runRawLoaderJob(myHBaseConf, processRecord.getProcessFile(),
                    processRecords.size());
            // Bail out on first failure.
            if (success) {
                processRecordService.setProcessState(processRecord, ProcessState.LOADED);
            } else {
                failures++;
            }

        }
    } finally {
        processRecordService.close();
    }

    return (failures == 0);
}

From source file:com.wandisco.s3hdfs.rewrite.filter.TestBase.java

License:Apache License

/**
 * @throws java.lang.Exception
 */
@Before
public void setUp() throws Exception {
    Configuration conf = new HdfsConfiguration(new S3HdfsConfiguration());
    conf.setInt(S3_PROXY_PORT_KEY, PROXY_PORT);
    conf.setBoolean(DFS_WEBHDFS_ENABLED_KEY, true);
    conf.setInt(DFS_DATANODE_SCAN_PERIOD_HOURS_KEY, 100);
    conf.setLong(DFS_BLOCK_SIZE_KEY, 1024);
    conf.setLong(DFS_NAMENODE_MIN_BLOCK_SIZE_KEY, 512);

    // ^ has to be a multiple of 512
    FsPermission.setUMask(conf, FsPermission.createImmutable((short) 0));
    // ^ eliminate the UMask in HDFS to remove perm denied exceptions in s3Dir
    hostName = conf.get(S3_SERVICE_HOSTNAME_KEY);
    System.out.println("S3HDFS ServiceHostName: " + hostName);

    s3Directory = conf.get(S3_DIRECTORY_KEY);
    cluster = new MiniDFSCluster.Builder(conf).nameNodeHttpPort(HTTP_PORT).numDataNodes(3).build();
    cluster.waitActive();
    hdfs = cluster.getFileSystem();

    //initialize s3 directory
    Path s3Path = new Path(s3Directory);
    assertTrue(hdfs.mkdirs(s3Path));

    testUtil = new S3HdfsTestUtil(hdfs, s3Directory);
    s3Service = testUtil.configureS3Service(hostName, PROXY_PORT);
}

From source file:com.wipro.ats.bdre.datagen.mr.Driver.java

License:Apache License

/**
 * @param args the cli arguments
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = getConf();
    GetGeneralConfig generalConfig = new GetGeneralConfig();
    GeneralConfig gc = generalConfig.byConigGroupAndKey("imconfig", "common.default-fs-name");
    conf.set("fs.defaultFS", gc.getDefaultVal());

    String processId = args[0];
    Path outputDir = new Path(ResolvePath.replaceVars(args[1]));

    Properties dataProps = Config.getDataProperties(processId);
    Properties tableProps = Config.getTableProperties(processId);

    TableUtil tableUtil = new TableUtil();
    Table table = tableUtil.formTableFromConfig(processId);
    FileSystem fs = FileSystem.get(conf);
    LOGGER.info("Default FS =" + conf.get("fs.defaultFS"));
    //set in the conf for mappers to use
    conf.set(Config.SEPARATOR_KEY, tableProps.getProperty("separator"));
    conf.set(Config.PID_KEY, processId);
    conf.setLong(Config.NUM_ROWS_KEY, Long.parseLong(dataProps.getProperty("numRows")));
    conf.setInt(Config.NUM_SPLITS_KEY, Integer.parseInt(dataProps.getProperty("numSplits")));

    Job job = Job.getInstance(conf);
    Path mrOutputPath = new Path(outputDir.toString() + "/MROUT/" + table.getTableName());

    FileOutputFormat.setOutputPath(job, mrOutputPath);
    job.setJobName("Datagen-" + table.getTableName());
    job.setJarByClass(Driver.class);
    job.setMapperClass(RecordGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.waitForCompletion(true);

    //merge and create a single file

    Path srcDir = mrOutputPath;
    Path destFile = new Path(outputDir.toString() + "/" + table.getTableName());
    FileUtil.copyMerge(fs, srcDir, fs, destFile, true, conf, "");

    //Return file info oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash("0");
    registerFileInfo.setFileSize(0L);
    registerFileInfo.setPath(destFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);
    return 0;
}

From source file:com.yahoo.glimmer.indexing.generator.TripleIndexGenerator.java

License:Open Source License

public int run(String[] args) throws Exception {
    SimpleJSAP jsap = new SimpleJSAP(TripleIndexGenerator.class.getName(),
            "Generates a keyword index from RDF data.",
            new Parameter[] {
                    new Switch(NO_CONTEXTS_ARG, 'C', "withoutContexts",
                            "Don't process the contexts for each tuple."),
                    new FlaggedOption(METHOD_ARG, JSAP.STRING_PARSER, "horizontal", JSAP.REQUIRED, 'm',
                            METHOD_ARG, "horizontal or vertical."),
                    new FlaggedOption(PREDICATES_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
                            'p', PREDICATES_ARG, "Subset of the properties to be indexed."),
                    new FlaggedOption(RESOURCE_PREFIX_ARG, JSAP.STRING_PARSER, "@", JSAP.NOT_REQUIRED, 'r',
                            RESOURCE_PREFIX_ARG,
                            "Prefix to add to object resource hash values when indexing. Stops queries for numbers matching resource hash values. Default is '@'"),

                    new UnflaggedOption("input", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the input data."),
                    new UnflaggedOption(NUMBER_OF_DOCS_ARG, JSAP.LONG_PARSER, JSAP.REQUIRED,
                            "Number of documents to index"),
                    new UnflaggedOption("output", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the output."),
                    new UnflaggedOption(RESOURCES_HASH_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location of the resources hash file."), });

    JSAPResult jsapResult = jsap.parse(args);

    // check whether the command line was valid, and if it wasn't,
    // display usage information and exit.
    if (!jsapResult.success()) {
        System.err.println();
        System.err.println("Usage: java " + TripleIndexGenerator.class.getName());
        System.err.println("                " + jsap.getUsage());
        System.err.println();
        System.exit(1);
    }

    Job job = Job.getInstance(getConf());
    job.setJarByClass(TripleIndexGenerator.class);
    job.setJobName("TripleIndexGenerator" + System.currentTimeMillis());

    FileInputFormat.setInputPaths(job, new Path(jsapResult.getString("input")));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(DocumentMapper.class);
    job.setMapOutputKeyClass(TermKey.class);
    job.setMapOutputValueClass(TermValue.class);

    job.setPartitionerClass(TermKey.FirstPartitioner.class);
    job.setGroupingComparatorClass(TermKey.FirstGroupingComparator.class);

    job.setReducerClass(TermReduce.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IndexRecordWriterValue.class);
    job.setOutputFormatClass(IndexRecordWriter.OutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(jsapResult.getString("output")));

    Configuration conf = job.getConfiguration();

    conf.setClass("mapred.output.key.comparator.class", TermKey.Comparator.class, WritableComparator.class);
    conf.set("mapreduce.user.classpath.first", "true");

    long numDocs = jsapResult.getLong(NUMBER_OF_DOCS_ARG);
    conf.setLong(NUMBER_OF_DOCUMENTS, numDocs);
    // Set this in an attempt to get around the 2 GB RAM task limit on our cluster,
    // in the hope of fixing Direct buffer memory errors.
    conf.setInt(INDEX_WRITER_CACHE_SIZE, 1024 * 1024);

    conf.set(OUTPUT_DIR, jsapResult.getString("output"));

    boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
    if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_HORIZONTAL)) {
        HorizontalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG),
                jsapResult.getString(RESOURCE_PREFIX_ARG));
    } else if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_VERTICAL)) {
        if (!jsapResult.contains(PREDICATES_ARG)) {
            throw new IllegalArgumentException("When '" + METHOD_ARG + "' is '" + METHOD_ARG_VALUE_VERTICAL
                    + "' you have to give a predicates file too.");
        }
        VerticalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG),
                jsapResult.getString(RESOURCE_PREFIX_ARG), jsapResult.getString(PREDICATES_ARG));
    } else {
        throw new IllegalArgumentException(METHOD_ARG + " should be '" + METHOD_ARG_VALUE_HORIZONTAL + "' or '"
                + METHOD_ARG_VALUE_VERTICAL + "'");
    }

    conf.setInt("mapreduce.input.linerecordreader.line.maxlength", 1024 * 1024);

    boolean success = job.waitForCompletion(true);

    return success ? 0 : 1;
}

From source file:de.l3s.content.timex.extracting.ClueWeb09Timex.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("input").hasArg().withDescription("input path").create(INPUT_OPTION));

    options.addOption(
            OptionBuilder.withArgName("output").hasArg().withDescription("output path").create(OUTPUT_OPTION));

    options.addOption(OptionBuilder.withArgName("column").hasArg()
            .withDescription("column to store row data into (must exist)").create(COLUMN));
    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    cmdline = parser.parse(options, args);

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    if (!cmdline.hasOption(OUTPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);

    String output = cmdline.getOptionValue(OUTPUT_OPTION);

    //      String column = cmdline.getOptionValue(COLUMN);

    LOG.info("Tool name: " + ClueWeb09Timex.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);
    //      LOG.info(" - column: " + column);

    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.zookeeper.quorum", "node05.ib,node03.ib,node04.ib");
    conf.set("hbase.zookeeper.property.clientPort", "2181");
    conf.set("hbase.master", "master.ib");

    //      conf.set("conf.column", column);

    long milliSeconds = 10000 * 60 * 60; //x10 default
    conf.setLong("mapred.task.timeout", milliSeconds);

    Job job = Job.getInstance(conf, ClueWeb09Timex.class.getSimpleName()
            + " time-confident extraction + annotation + HBase import: " + input);

    //Configuration conf = new Configuration();
    //Job job = Job.getInstance(conf, "web pages count");
    job.setJarByClass(ClueWeb09Timex.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(ClueWeb09InputFormat.class);
    job.setOutputFormatClass(TableOutputFormat.class);
    job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, output);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    job.setMapperClass(TMapper.class);
    //job.setReducerClass(IntSumReducer.class);
    //job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(input));
    //FileOutputFormat.setOutputPath(job, new Path(output));
    job.waitForCompletion(true);

    return 0;
}